groupmq-plus 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +59 -0
- package/README.md +722 -0
- package/dist/index.cjs +2567 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1300 -0
- package/dist/index.d.ts +1300 -0
- package/dist/index.js +2557 -0
- package/dist/index.js.map +1 -0
- package/dist/lua/change-delay.lua +62 -0
- package/dist/lua/check-stalled.lua +86 -0
- package/dist/lua/clean-status.lua +64 -0
- package/dist/lua/cleanup-poisoned-group.lua +46 -0
- package/dist/lua/cleanup.lua +46 -0
- package/dist/lua/complete-and-reserve-next-with-metadata.lua +221 -0
- package/dist/lua/complete-with-metadata.lua +190 -0
- package/dist/lua/complete.lua +51 -0
- package/dist/lua/dead-letter.lua +86 -0
- package/dist/lua/enqueue-batch.lua +149 -0
- package/dist/lua/enqueue-flow.lua +107 -0
- package/dist/lua/enqueue.lua +154 -0
- package/dist/lua/get-active-count.lua +6 -0
- package/dist/lua/get-active-jobs.lua +6 -0
- package/dist/lua/get-delayed-count.lua +5 -0
- package/dist/lua/get-delayed-jobs.lua +5 -0
- package/dist/lua/get-unique-groups-count.lua +13 -0
- package/dist/lua/get-unique-groups.lua +15 -0
- package/dist/lua/get-waiting-count.lua +11 -0
- package/dist/lua/get-waiting-jobs.lua +15 -0
- package/dist/lua/heartbeat.lua +22 -0
- package/dist/lua/is-empty.lua +35 -0
- package/dist/lua/promote-delayed-jobs.lua +40 -0
- package/dist/lua/promote-delayed-one.lua +44 -0
- package/dist/lua/promote-staged.lua +70 -0
- package/dist/lua/record-job-result.lua +143 -0
- package/dist/lua/remove.lua +55 -0
- package/dist/lua/reserve-atomic.lua +114 -0
- package/dist/lua/reserve-batch.lua +141 -0
- package/dist/lua/reserve.lua +161 -0
- package/dist/lua/retry.lua +53 -0
- package/package.json +92 -0
package/dist/lua/check-stalled.lua
@@ -0,0 +1,86 @@
-- Check for stalled jobs and move them back to waiting or fail them
-- KEYS: namespace; ARGV: currentTime, gracePeriod, maxStalledCount
-- Returns: array of [jobId, groupId, action] for each stalled job found
-- action: "recovered" or "failed"

local ns = KEYS[1]
local now = tonumber(ARGV[1])
local gracePeriod = tonumber(ARGV[2]) or 0
local maxStalledCount = tonumber(ARGV[3]) or 1

-- Circuit breaker for high concurrency: limit stalled job recovery
local circuitBreakerKey = ns .. ":stalled:circuit"
local lastCheck = redis.call("GET", circuitBreakerKey)
if lastCheck then
  local lastCheckTime = tonumber(lastCheck)
  local circuitBreakerInterval = 2000
  if lastCheckTime and (now - lastCheckTime) < circuitBreakerInterval then
    return {}
  end
end
redis.call("SET", circuitBreakerKey, now, "PX", 3000)

local processingKey = ns .. ":processing"
local groupsKey = ns .. ":groups"

-- Candidates: jobs whose deadlines are past
local candidates = redis.call("ZRANGEBYSCORE", processingKey, 0, now - gracePeriod, "LIMIT", 0, 100)
if not candidates or #candidates == 0 then
  return {}
end

local results = {}

for _, jobId in ipairs(candidates) do
  local jobKey = ns .. ":job:" .. jobId
  local h = redis.call("HMGET", jobKey, "groupId","stalledCount","maxAttempts","attempts","status","finishedOn","score")
  local groupId = h[1]
  if groupId then
    local stalledCount = tonumber(h[2]) or 0
    local maxAttempts = tonumber(h[3]) or 3
    local status = h[5]
    local finishedOn = tonumber(h[6] or "0")
    -- CRITICAL: Don't recover jobs that are completing (prevents race with completion)
    -- "completing" is a temporary state set by complete-with-metadata.lua to prevent races
    if status == "processing" then
      stalledCount = stalledCount + 1
      redis.call("HSET", jobKey, "stalledCount", stalledCount)
      -- BullMQ-style: Remove from per-group active list
      local groupActiveKey = ns .. ":g:" .. groupId .. ":active"
      redis.call("LREM", groupActiveKey, 1, jobId)

      if stalledCount >= maxStalledCount and maxStalledCount > 0 then
        redis.call("ZREM", processingKey, jobId)
        local groupKey = ns .. ":g:" .. groupId
        redis.call("ZREM", groupKey, jobId)
        redis.call("DEL", ns .. ":processing:" .. jobId)
        redis.call("HSET", jobKey, "status","failed","finishedOn", now,
          "failedReason", "Job stalled " .. stalledCount .. " times (max: " .. maxStalledCount .. ")")
        redis.call("ZADD", ns .. ":failed", now, jobId)
        table.insert(results, jobId); table.insert(results, groupId); table.insert(results, "failed")
      else
        local stillInProcessing = redis.call("ZSCORE", processingKey, jobId)
        if stillInProcessing then
          redis.call("ZREM", processingKey, jobId)
          redis.call("DEL", ns .. ":processing:" .. jobId)
          local score = tonumber(h[7])
          if score then
            local groupKey2 = ns .. ":g:" .. groupId
            redis.call("ZADD", groupKey2, score, jobId)
            local head = redis.call("ZRANGE", groupKey2, 0, 0, "WITHSCORES")
            if head and #head >= 2 then
              local headScore = tonumber(head[2])
              redis.call("ZADD", ns .. ":ready", headScore, groupId)
            end
            redis.call("SADD", groupsKey, groupId)
          end
          redis.call("HSET", jobKey, "status", "waiting")
          table.insert(results, jobId); table.insert(results, groupId); table.insert(results, "recovered")
        end
      end
    end
  end
end

return results
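The script's calling contract is visible in its header: one key (the namespace) and three arguments, with a flat array of [jobId, groupId, action] triples coming back. A minimal sketch of the worker-side call, assuming ioredis and an on-disk copy of the script; the file path, namespace, and helper name are illustrative, not the package's documented API:

import Redis from "ioredis";
import { readFileSync } from "fs";

const redis = new Redis();
const stalledScript = readFileSync(
  "node_modules/groupmq-plus/dist/lua/check-stalled.lua", "utf8");

async function checkStalled(ns: string, gracePeriodMs: number, maxStalledCount: number) {
  // One key (the namespace), then currentTime, gracePeriod, maxStalledCount as ARGV
  const flat = (await redis.eval(
    stalledScript, 1, ns, Date.now(), gracePeriodMs, maxStalledCount)) as string[];
  // The script returns a flat array; regroup it into [jobId, groupId, action] triples
  const events: { jobId: string; groupId: string; action: "recovered" | "failed" }[] = [];
  for (let i = 0; i < flat.length; i += 3) {
    events.push({
      jobId: flat[i],
      groupId: flat[i + 1],
      action: flat[i + 2] as "recovered" | "failed",
    });
  }
  return events;
}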
package/dist/lua/clean-status.lua
@@ -0,0 +1,64 @@
-- argv: ns, status, graceAtMs, limit
local ns = KEYS[1]
local status = ARGV[1]
local graceAt = tonumber(ARGV[2]) or 0
local limit = tonumber(ARGV[3]) or 1000

local setKey = nil
if status == 'completed' then
  setKey = ns .. ':completed'
elseif status == 'failed' then
  setKey = ns .. ':failed'
elseif status == 'delayed' then
  setKey = ns .. ':delayed'
else
  -- unsupported status for clean
  return 0
end

-- Fetch up to 'limit' job ids with score <= graceAt
local ids = redis.call('ZRANGEBYSCORE', setKey, '-inf', graceAt, 'LIMIT', 0, limit)

local removed = 0
for i = 1, #ids do
  local id = ids[i]
  local jobKey = ns .. ':job:' .. id

  -- Remove from the primary set first to avoid reprocessing
  redis.call('ZREM', setKey, id)

  -- Remove from group and update ready queue for ALL statuses
  -- This prevents poisoned groups when completed/failed jobs are cleaned
  local groupId = redis.call('HGET', jobKey, 'groupId')
  if groupId then
    local gZ = ns .. ':g:' .. groupId
    local readyKey = ns .. ':ready'
    redis.call('ZREM', gZ, id)
    local jobCount = redis.call('ZCARD', gZ)
    if jobCount == 0 then
      redis.call('ZREM', readyKey, groupId)
      -- Clean up empty group
      redis.call('DEL', gZ)
      redis.call('SREM', ns .. ':groups', groupId)
    elseif status == 'delayed' then
      -- Only update ready queue score for delayed jobs
      -- (completed/failed jobs shouldn't affect ready queue)
      local head = redis.call('ZRANGE', gZ, 0, 0, 'WITHSCORES')
      if head and #head >= 2 then
        local headScore = tonumber(head[2])
        redis.call('ZADD', readyKey, headScore, groupId)
      end
    end
  end

  -- Delete job hash, idempotence key and flow results
  redis.call('DEL', jobKey)
  redis.call('DEL', ns .. ':unique:' .. id)
  redis.call('DEL', ns .. ':flow:results:' .. id)

  removed = removed + 1
end

return removed
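A hedged sketch of driving this script from Node, reusing the ioredis client from the previous example; the wrapper name, file path, and retention policy are illustrative:

const cleanScript = readFileSync(
  "node_modules/groupmq-plus/dist/lua/clean-status.lua", "utf8");

// KEYS[1] = namespace; ARGV = status, graceAtMs, limit (per the header comment above)
async function cleanStatus(ns: string, status: "completed" | "failed" | "delayed",
                           olderThanMs: number, limit = 1000): Promise<number> {
  const graceAt = Date.now() - olderThanMs;
  // Returns how many jobs were removed; call again while it keeps returning `limit`
  return (await redis.eval(cleanScript, 1, ns, status, graceAt, limit)) as number;
}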
package/dist/lua/cleanup-poisoned-group.lua
@@ -0,0 +1,46 @@
-- argv: ns, groupId, now
local ns = KEYS[1]
local groupId = ARGV[1]
local now = tonumber(ARGV[2])

local readyKey = ns .. ":ready"
local gZ = ns .. ":g:" .. groupId
local lockKey = ns .. ":lock:" .. groupId

-- Check if group has any jobs at all
local jobCount = redis.call("ZCARD", gZ)
if jobCount == 0 then
  redis.call("ZREM", readyKey, groupId)
  return "empty"
end

-- Check if group is currently locked by another worker
local lockValue = redis.call("GET", lockKey)
if lockValue then
  local lockTtl = redis.call("PTTL", lockKey)
  if lockTtl > 0 then
    return "locked"
  end
end

-- Check if all jobs in the group have exceeded max attempts
local jobs = redis.call("ZRANGE", gZ, 0, -1)
local reservableJobs = 0
for i = 1, #jobs do
  local jobId = jobs[i]
  local jobKey = ns .. ":job:" .. jobId
  local attempts = tonumber(redis.call("HGET", jobKey, "attempts"))
  local maxAttempts = tonumber(redis.call("HGET", jobKey, "maxAttempts"))
  if attempts and maxAttempts and attempts < maxAttempts then
    reservableJobs = reservableJobs + 1
  end
end

if reservableJobs == 0 then
  redis.call("ZREM", readyKey, groupId)
  return "poisoned"
end

return "ok"
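The script answers with one of four strings. A sketch of how a caller might branch on the reply, reusing the client and script-loading pattern from the first example; the namespace and group id are illustrative:

const poisonScript = readFileSync(
  "node_modules/groupmq-plus/dist/lua/cleanup-poisoned-group.lua", "utf8");

const ns = "groupmq:orders";    // illustrative namespace
const groupId = "customer-42";  // illustrative group id

const state = (await redis.eval(poisonScript, 1, ns, groupId, Date.now())) as
  "empty" | "locked" | "poisoned" | "ok";
if (state === "poisoned") {
  // Every job in the group has exhausted maxAttempts; the script already
  // removed the group from the ready set, so surface this to an operator.
}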
package/dist/lua/cleanup.lua
@@ -0,0 +1,46 @@
-- argv: ns, nowEpochMs
local ns = KEYS[1]
local now = tonumber(ARGV[1])

local readyKey = ns .. ":ready"
local processingKey = ns .. ":processing"
local cleaned = 0

local expiredJobs = redis.call("ZRANGEBYSCORE", processingKey, 0, now)
for _, jobId in ipairs(expiredJobs) do
  -- CRITICAL: Verify job is STILL in processing to avoid race conditions
  -- If job was completed between our snapshot and now, don't re-add it
  local stillInProcessing = redis.call("ZSCORE", processingKey, jobId)

  if stillInProcessing then
    local procKey = ns .. ":processing:" .. jobId
    local procData = redis.call("HMGET", procKey, "groupId", "deadlineAt")
    local gid = procData[1]
    local deadlineAt = tonumber(procData[2])
    if gid and deadlineAt and now > deadlineAt then
      local jobKey = ns .. ":job:" .. jobId
      local jobScore = redis.call("HGET", jobKey, "score")
      if jobScore then
        local gZ = ns .. ":g:" .. gid
        redis.call("ZADD", gZ, tonumber(jobScore), jobId)
        local head = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
        if head and #head >= 2 then
          local headScore = tonumber(head[2])
          redis.call("ZADD", readyKey, headScore, gid)
        end
        redis.call("DEL", ns .. ":lock:" .. gid)
        redis.call("DEL", procKey)
        redis.call("ZREM", processingKey, jobId)

        -- No counter operations - use ZCARD for counts

        cleaned = cleaned + 1
      end
    end
  end
  -- If not still in processing, it was completed - don't re-add it!
end

return cleaned
package/dist/lua/complete-and-reserve-next-with-metadata.lua
@@ -0,0 +1,221 @@
-- Complete a job with metadata and atomically reserve the next job from the same group
-- argv: ns, completedJobId, groupId, status, timestamp, resultOrError, keepCompleted, keepFailed,
--       processedOn, finishedOn, attempts, maxAttempts, now, vt
local ns = KEYS[1]
local completedJobId = ARGV[1]
local gid = ARGV[2]
local status = ARGV[3]
local timestamp = tonumber(ARGV[4])
local resultOrError = ARGV[5]
local keepCompleted = tonumber(ARGV[6])
local keepFailed = tonumber(ARGV[7])
local processedOn = ARGV[8]
local finishedOn = ARGV[9]
local attempts = ARGV[10]
local maxAttempts = ARGV[11]
local now = tonumber(ARGV[12])
local vt = tonumber(ARGV[13])

local jobKey = ns .. ":job:" .. completedJobId

-- [PHASE 3 MODIFICATION START: Get parentId before potentially deleting the job]
local parentId = redis.call("HGET", jobKey, "parentId")
-- [PHASE 3 MODIFICATION END]

-- Part 1: Atomically verify and mark completion (prevent duplicate processing)
local processingKey = ns .. ":processing"

-- CRITICAL: Check both status AND processing set membership atomically
-- This prevents race with stalled job recovery
local jobStatus = redis.call("HGET", jobKey, "status")
local stillInProcessing = redis.call("ZSCORE", processingKey, completedJobId)

-- If job is not in "processing" state OR not in processing set, this is late/duplicate
if jobStatus ~= "processing" or not stillInProcessing then
  return nil
end

-- Atomically mark as completed and remove from processing
-- This prevents stalled checker from racing with us
redis.call("HSET", jobKey, "status", "completing") -- Temporary status to block stalled checker
redis.call("DEL", ns .. ":processing:" .. completedJobId)
redis.call("ZREM", processingKey, completedJobId)

-- Part 2: Record job metadata (completed or failed)

if status == "completed" then
  local completedKey = ns .. ":completed"

  -- CRITICAL: Always set final status first, even if job will be deleted
  -- This ensures any concurrent reads see "completed", not "completing"
  redis.call("HSET", jobKey, "status", "completed")

  -- [PHASE 3 MODIFICATION START: Update parent flow if exists]
  if parentId then
    local parentKey = ns .. ":job:" .. parentId
    -- 1. Store child result in flow:results hash (CRITICAL: was missing!)
    local flowResultsKey = ns .. ":flow:results:" .. parentId
    redis.call("HSET", flowResultsKey, completedJobId, resultOrError)

    -- 2. Decrement remaining counter
    local remaining = redis.call("HINCRBY", parentKey, "flowRemaining", -1)

    -- 3. If all children done, move parent to waiting
    if remaining <= 0 then
      local parentStatus = redis.call("HGET", parentKey, "status")
      if parentStatus == "waiting-children" then
        redis.call("HSET", parentKey, "status", "waiting")
        local parentGroupId = redis.call("HGET", parentKey, "groupId")
        local parentScore = tonumber(redis.call("HGET", parentKey, "score"))
        if not parentScore then
          parentScore = tonumber(now)
        end

        local pGZ = ns .. ":g:" .. parentGroupId
        redis.call("ZADD", pGZ, parentScore, parentId)
        redis.call("SADD", ns .. ":groups", parentGroupId)

        -- Check if should add to ready queue (if head)
        local pHead = redis.call("ZRANGE", pGZ, 0, 0, "WITHSCORES")
        if pHead and #pHead >= 2 then
          local pHeadScore = tonumber(pHead[2])
          redis.call("ZADD", ns .. ":ready", pHeadScore, parentGroupId)
        end
      end
    end
  end
  -- [PHASE 3 MODIFICATION END]

  if keepCompleted > 0 then
    -- Store full job metadata and add to completed set
    redis.call("HSET", jobKey,
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts,
      "returnvalue", resultOrError
    )
    redis.call("ZADD", completedKey, timestamp, completedJobId)

    -- Trim old entries atomically
    local zcount = redis.call("ZCARD", completedKey)
    local toRemove = zcount - keepCompleted
    if toRemove > 0 then
      local oldIds = redis.call("ZRANGE", completedKey, 0, toRemove - 1)
      if #oldIds > 0 then
        redis.call("ZREMRANGEBYRANK", completedKey, 0, toRemove - 1)
        for i = 1, #oldIds do
          local oldId = oldIds[i]
          redis.call("DEL", ns .. ":job:" .. oldId)
          redis.call("DEL", ns .. ":unique:" .. oldId)
          redis.call("DEL", ns .. ":flow:results:" .. oldId)
        end
      end
    end
  else
    -- keepCompleted == 0: Delete immediately (status already set above)
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. completedJobId)
    redis.call("DEL", ns .. ":flow:results:" .. completedJobId)
  end

elseif status == "failed" then
  local failedKey = ns .. ":failed"
  local errorInfo = cjson.decode(resultOrError)

  -- CRITICAL: Always set final status first, even if job will be deleted
  redis.call("HSET", jobKey, "status", "failed")

  if keepFailed > 0 then
    redis.call("HSET", jobKey,
      "failedReason", errorInfo.message or "Error",
      "failedName", errorInfo.name or "Error",
      "stacktrace", errorInfo.stack or "",
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts
    )
    redis.call("ZADD", failedKey, timestamp, completedJobId)
  else
    -- Delete job (status already set above)
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. completedJobId)
    redis.call("DEL", ns .. ":flow:results:" .. completedJobId)
  end
end

-- Part 3: Handle group active list and get next job (BullMQ-style)
local groupActiveKey = ns .. ":g:" .. gid .. ":active"
local activeJobId = redis.call("LINDEX", groupActiveKey, 0)

-- Always clean up this job from active list, even if not at head
-- This prevents stale active lists from race conditions
if activeJobId == completedJobId then
  -- Normal case: this job is at the head of active list
  redis.call("LPOP", groupActiveKey)
else
  -- Race condition: job is not at head (maybe already removed, or wrong job)
  -- Clean it up anyway to prevent stale entries
  redis.call("LREM", groupActiveKey, 1, completedJobId)

  -- If active list had a different job or was empty, don't try to reserve next
  -- Return nil to indicate no chaining
  return nil
end

local gZ = ns .. ":g:" .. gid
local zpop = redis.call("ZPOPMIN", gZ, 1)
if not zpop or #zpop == 0 then
  -- Clean up empty group
  local jobCount = redis.call("ZCARD", gZ)
  if jobCount == 0 then
    redis.call("DEL", gZ)
    redis.call("SREM", ns .. ":groups", gid)
    redis.call("ZREM", ns .. ":ready", gid)
  end
  -- No next job
  return nil
end

local nextJobId = zpop[1]
local nextJobKey = ns .. ":job:" .. nextJobId
local job = redis.call("HMGET", nextJobKey, "id","groupId","data","attempts","maxAttempts","seq","timestamp","orderMs","score")
local id, groupId, payload, attempts, maxAttempts, seq, enq, orderMs, score = job[1], job[2], job[3], job[4], job[5], job[6], job[7], job[8], job[9]

-- Validate job data exists (handle corrupted/missing job hash)
if not id or id == false then
  -- Job hash is missing/corrupted, clean up and return completion only
  -- Re-add next job to ready queue if exists
  local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
  if nextHead and #nextHead >= 2 then
    local nextScore = tonumber(nextHead[2])
    local readyKey = ns .. ":ready"
    redis.call("ZADD", readyKey, nextScore, groupId)
  end

  -- Return nil to indicate no next job was reserved
  return nil
end

-- Push next job to active list (chaining)
redis.call("LPUSH", groupActiveKey, id)

local procKey = ns .. ":processing:" .. id
local deadline = now + vt
redis.call("HSET", procKey, "groupId", groupId, "deadlineAt", tostring(deadline))

local processingKey = ns .. ":processing"
redis.call("ZADD", processingKey, deadline, id)

-- Mark next job as processing for accurate stalled detection
redis.call("HSET", nextJobKey, "status", "processing")

local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
if nextHead and #nextHead >= 2 then
  local nextScore = tonumber(nextHead[2])
  local readyKey = ns .. ":ready"
  redis.call("ZADD", readyKey, nextScore, groupId)
end

return id .. "|||" .. groupId .. "|||" .. payload .. "|||" .. attempts .. "|||" .. maxAttempts .. "|||" .. seq .. "|||" .. enq .. "|||" .. orderMs .. "|||" .. score .. "|||" .. deadline
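On success the script returns a single "|||"-joined string describing the next reserved job, in the HMGET field order above plus the computed deadline, or nil when nothing was chained. A sketch of the decoding a caller might do; the helper name and the assumption that the data field holds JSON are illustrative:

// raw is the reply from complete-and-reserve-next-with-metadata.lua, or null.
// Note: this delimiter scheme assumes "|||" never appears inside the payload.
function parseReservedJob(raw: string | null) {
  if (raw === null) return null; // completion recorded, but no next job was chained
  const [id, groupId, data, attempts, maxAttempts, seq, timestamp, orderMs, score, deadlineAt] =
    raw.split("|||");
  return {
    id,
    groupId,
    data: JSON.parse(data), // assumes payloads are stored as JSON strings
    attempts: Number(attempts),
    maxAttempts: Number(maxAttempts),
    seq: Number(seq),
    timestamp: Number(timestamp),
    orderMs: Number(orderMs),
    score: Number(score),
    deadlineAt: Number(deadlineAt),
  };
}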
package/dist/lua/complete-with-metadata.lua
@@ -0,0 +1,190 @@
-- Complete a job: unlock group AND record metadata atomically in one call
-- argv: ns, jobId, groupId, status, timestamp, resultOrError, keepCompleted, keepFailed,
--       processedOn, finishedOn, attempts, maxAttempts
local ns = KEYS[1]
local jobId = ARGV[1]
local gid = ARGV[2]
local status = ARGV[3]
local timestamp = tonumber(ARGV[4])
local resultOrError = ARGV[5]
local keepCompleted = tonumber(ARGV[6])
local keepFailed = tonumber(ARGV[7])
local processedOn = ARGV[8]
local finishedOn = ARGV[9]
local attempts = ARGV[10]
local maxAttempts = ARGV[11]

local jobKey = ns .. ":job:" .. jobId

-- [PHASE 3 MODIFICATION START: Get parentId before potentially deleting the job]
local parentId = redis.call("HGET", jobKey, "parentId")
-- [PHASE 3 MODIFICATION END]

-- Part 1: Atomically verify and mark completion (prevent duplicate processing)
local processingKey = ns .. ":processing"

-- CRITICAL: Check both status AND processing set membership atomically
-- This prevents race with stalled job recovery
local jobStatus = redis.call("HGET", jobKey, "status")
local stillInProcessing = redis.call("ZSCORE", processingKey, jobId)

-- If job is not in "processing" state OR not in processing set, this is late/duplicate
if jobStatus ~= "processing" or not stillInProcessing then
  -- Job was already handled (recovered, failed, or completed by another worker)
  -- Return 0 to indicate this completion was ignored
  return 0
end

-- Atomically mark as completed and remove from processing
-- This prevents stalled checker from racing with us
redis.call("HSET", jobKey, "status", "completing") -- Temporary status to block stalled checker
redis.call("DEL", ns .. ":processing:" .. jobId)
redis.call("ZREM", processingKey, jobId)

-- Always remove this job from active list to prevent stale entries
local groupActiveKey = ns .. ":g:" .. gid .. ":active"
local activeJobId = redis.call("LINDEX", groupActiveKey, 0)
local wasActive = (activeJobId == jobId)

if wasActive then
  -- Normal case: remove from head of active list
  redis.call("LPOP", groupActiveKey)
else
  -- Race condition: not at head, but still remove to prevent stale entries
  redis.call("LREM", groupActiveKey, 1, jobId)
end

-- Check if there are more jobs in this group
local gZ = ns .. ":g:" .. gid
local jobCount = redis.call("ZCARD", gZ)
if jobCount == 0 then
  -- Remove empty group
  redis.call("DEL", gZ)
  redis.call("DEL", groupActiveKey)
  redis.call("SREM", ns .. ":groups", gid)
  redis.call("ZREM", ns .. ":ready", gid)
  redis.call("DEL", ns .. ":buffer:" .. gid)
  redis.call("ZREM", ns .. ":buffering", gid)
else
  -- Group has more jobs, re-add to ready if not buffering
  local groupBufferKey = ns .. ":buffer:" .. gid
  local isBuffering = redis.call("EXISTS", groupBufferKey)

  if isBuffering == 0 then
    local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
    if nextHead and #nextHead >= 2 then
      local nextScore = tonumber(nextHead[2])
      local readyKey = ns .. ":ready"
      redis.call("ZADD", readyKey, nextScore, gid)
    end
  end
end

-- [PHASE 3 MODIFICATION START: Update Flow Parent]
if parentId then
  local parentKey = ns .. ":job:" .. parentId
  -- 1. Store child result in a separate hash to define parent's "childrenValues"
  -- Key: flow:results:{parentId}, Field: {childId}
  local flowResultsKey = ns .. ":flow:results:" .. parentId
  redis.call("HSET", flowResultsKey, jobId, resultOrError)

  -- 2. Decrement remaining counter
  local remaining = redis.call("HINCRBY", parentKey, "flowRemaining", -1)

  -- 3. If all children done, move parent to waiting
  if remaining <= 0 then
    local parentStatus = redis.call("HGET", parentKey, "status")
    if parentStatus == "waiting-children" then
      redis.call("HSET", parentKey, "status", "waiting")

      -- Add parent to its group and ready queue
      local parentGroupId = redis.call("HGET", parentKey, "groupId")
      local parentScore = tonumber(redis.call("HGET", parentKey, "score"))
      if not parentScore then
        parentScore = tonumber(redis.call("TIME")[1]) * 1000
      end

      local pGZ = ns .. ":g:" .. parentGroupId
      redis.call("ZADD", pGZ, parentScore, parentId)
      redis.call("SADD", ns .. ":groups", parentGroupId)

      -- Check if should add to ready queue (if head)
      local pHead = redis.call("ZRANGE", pGZ, 0, 0, "WITHSCORES")
      if pHead and #pHead >= 2 then
        local pHeadScore = tonumber(pHead[2])
        redis.call("ZADD", ns .. ":ready", pHeadScore, parentGroupId)
      end
    end
  end
end
-- [PHASE 3 MODIFICATION END]

-- Part 2: Record job metadata (completed or failed)
if status == "completed" then
  local completedKey = ns .. ":completed"

  -- CRITICAL: Always set final status first, even if job will be deleted
  -- This ensures any concurrent reads see "completed", not "completing"
  redis.call("HSET", jobKey, "status", "completed")

  if keepCompleted > 0 then
    -- Store full job metadata and add to completed set
    redis.call("HSET", jobKey,
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts,
      "returnvalue", resultOrError
    )
    redis.call("ZADD", completedKey, timestamp, jobId)

    -- Trim old entries atomically
    local zcount = redis.call("ZCARD", completedKey)
    local toRemove = zcount - keepCompleted
    if toRemove > 0 then
      local oldIds = redis.call("ZRANGE", completedKey, 0, toRemove - 1)
      if #oldIds > 0 then
        redis.call("ZREMRANGEBYRANK", completedKey, 0, toRemove - 1)
        for i = 1, #oldIds do
          local oldId = oldIds[i]
          redis.call("DEL", ns .. ":job:" .. oldId)
          redis.call("DEL", ns .. ":unique:" .. oldId)
          redis.call("DEL", ns .. ":flow:results:" .. oldId)
        end
      end
    end
  else
    -- keepCompleted == 0: Delete immediately (status already set above)
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. jobId)
    redis.call("DEL", ns .. ":flow:results:" .. jobId)
  end

elseif status == "failed" then
  local failedKey = ns .. ":failed"
  local errorInfo = cjson.decode(resultOrError)

  -- CRITICAL: Always set final status first, even if job will be deleted
  redis.call("HSET", jobKey, "status", "failed")

  if keepFailed > 0 then
    redis.call("HSET", jobKey,
      "failedReason", errorInfo.message or "Error",
      "failedName", errorInfo.name or "Error",
      "stacktrace", errorInfo.stack or "",
      "processedOn", processedOn,
      "finishedOn", finishedOn,
      "attempts", attempts,
      "maxAttempts", maxAttempts
    )
    redis.call("ZADD", failedKey, timestamp, jobId)
  else
    -- Delete job (status already set above)
    redis.call("DEL", jobKey)
    redis.call("DEL", ns .. ":unique:" .. jobId)
    redis.call("DEL", ns .. ":flow:results:" .. jobId)
  end
end

return 1
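One detail worth noting for callers: the failed branch runs cjson.decode(resultOrError) and reads message, name, and stack, so the error argument must arrive as a JSON string. A sketch of the worker-side serialization under that assumption; the helper itself is illustrative:

// The failed branch decodes resultOrError with cjson and reads these fields,
// so an Error should be serialized before the EVAL call.
function serializeError(err: Error): string {
  return JSON.stringify({
    name: err.name,          // stored as failedName
    message: err.message,    // stored as failedReason
    stack: err.stack ?? "",  // stored as stacktrace
  });
}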