screwdriver-queue-service 5.0.3 → 6.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/default.yaml +2 -2
- package/docs/ARCHITECTURE_REDESIGN.md +214 -0
- package/docs/QS-REDIS-ATOMIC-REDESIGN.png +0 -0
- package/package.json +2 -1
- package/plugins/queue/scheduler.js +2 -8
- package/plugins/worker/lib/BlockedBy.js +144 -330
- package/plugins/worker/lib/LuaScriptLoader.js +232 -0
- package/plugins/worker/lib/jobs.js +74 -26
- package/plugins/worker/lib/lua/checkTimeout.lua +166 -0
- package/plugins/worker/lib/lua/lib/CollapseDecider.lua +155 -0
- package/plugins/worker/lib/lua/lib/DependencyResolver.lua +109 -0
- package/plugins/worker/lib/lua/lib/StateValidator.lua +179 -0
- package/plugins/worker/lib/lua/lib/TimeoutDecider.lua +161 -0
- package/plugins/worker/lib/lua/startBuild.lua +217 -0
- package/plugins/worker/lib/lua/stopBuild.lua +133 -0
- package/plugins/worker/lib/timeout.js +123 -68
- package/plugins/worker/worker.js +10 -10
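
This release replaces the per-worker Redlock locks with server-side Lua: `LuaScriptLoader.js` loads the scripts under `plugins/worker/lib/lua/` at startup and the workers invoke them, so each queue-state transition executes atomically inside Redis (see the `worker.js` and `timeout.js` diffs below). For orientation, here is a minimal sketch of that load-then-EVALSHA pattern, assuming an ioredis-style client; aside from the `loadAllScripts`/`executeScript` method names that appear in the diffs below, the class and its details are illustrative rather than the package's actual implementation.

```js
// Sketch only: how a loader in the spirit of LuaScriptLoader might register Lua
// scripts and run them atomically. Assumes an ioredis client; everything except
// the loadAllScripts/executeScript names is illustrative.
const fs = require('fs');
const path = require('path');

class ScriptRunner {
    constructor(redis, scriptDir) {
        this.redis = redis;
        this.scriptDir = scriptDir;
        this.shas = {}; // file name -> SHA1 returned by SCRIPT LOAD
    }

    async loadAllScripts() {
        const files = fs.readdirSync(this.scriptDir).filter(f => f.endsWith('.lua'));

        for (const file of files) {
            const source = fs.readFileSync(path.join(this.scriptDir, file), 'utf8');

            // SCRIPT LOAD caches the script on the server and returns its SHA1
            this.shas[file] = await this.redis.script('LOAD', source);
        }
    }

    async executeScript(name, keys, args) {
        // EVALSHA runs the cached script atomically; no other Redis command
        // interleaves while it executes, which is what replaces the old Redlock locks
        return this.redis.evalsha(this.shas[name], keys.length, ...keys, ...args);
    }
}

// Hypothetical usage:
//   const runner = new ScriptRunner(redis, path.join(__dirname, 'lua'));
//   await runner.loadAllScripts();
//   const raw = await runner.executeScript('stopBuild.lua', [], ['123', '45', 'resque:', 'running_job_', 'waiting_job_']);
```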

package/plugins/worker/lib/lua/stopBuild.lua
ADDED

@@ -0,0 +1,133 @@
+--[[
+stopBuild.lua - Atomic build stop/cleanup
+
+This script atomically cleans up all Redis state for a stopped build.
+
+ARGV[1] = buildId (string)
+ARGV[2] = jobId (string)
+ARGV[3] = queuePrefix (string, e.g., "resque:")
+ARGV[4] = runningJobsPrefix (string, e.g., "running_job_")
+ARGV[5] = waitingJobsPrefix (string, e.g., "waiting_job_")
+
+Returns: JSON string with cleanup result
+{
+    action: "CLEANED" | "NOT_RUNNING" | "PARTIAL",
+    buildId: string,
+    jobId: string,
+    keysDeleted: {
+        buildConfig: boolean,
+        runningKey: boolean,
+        lastRunningKey: boolean,
+        waitingKey: boolean,
+        timeoutConfig: boolean,
+        deleteKey: boolean
+    }
+}
+]]
+
+
+-- Parse arguments
+local buildId = ARGV[1]
+local jobId = ARGV[2]
+local queuePrefix = ARGV[3]
+local runningJobsPrefix = ARGV[4]
+local waitingJobsPrefix = ARGV[5]
+
+-- Convert buildId to number for comparison
+local buildIdNum = tonumber(buildId)
+
+-- Build Redis keys
+local buildConfigKey = queuePrefix .. "buildConfigs"
+local timeoutConfigKey = queuePrefix .. "timeoutConfigs"
+local runningKey = runningJobsPrefix .. jobId
+local lastRunningKey = "last_" .. runningJobsPrefix .. jobId
+local waitingKey = waitingJobsPrefix .. jobId
+local deleteKey = "deleted_" .. jobId .. "_" .. buildId
+
+-- Read current state to validate ownership
+
+local currentRunningBuildId = redis.call("GET", runningKey)
+local lastRunningBuildId = redis.call("GET", lastRunningKey)
+local buildConfigExists = redis.call("HEXISTS", buildConfigKey, buildId)
+
+-- Track what gets deleted
+local keysDeleted = {
+    buildConfig = false,
+    runningKey = false,
+    lastRunningKey = false,
+    waitingKey = false,
+    timeoutConfig = false,
+    deleteKey = false
+}
+
+-- Determine what to clean up
+
+-- Convert Redis values to numbers for comparison
+local currentRunningId = currentRunningBuildId and tonumber(currentRunningBuildId) or nil
+local lastRunningId = lastRunningBuildId and tonumber(lastRunningBuildId) or nil
+
+-- Determine if this build "owns" the running keys
+local ownsRunningKey = (currentRunningId == buildIdNum)
+local ownsLastRunningKey = (lastRunningId == buildIdNum)
+
+-- Atomically clean up all keys
+
+-- Always delete buildConfig (unconditionally)
+if buildConfigExists == 1 then
+    redis.call("HDEL", buildConfigKey, buildId)
+    keysDeleted.buildConfig = true
+end
+
+-- Delete runningKey only if it matches this buildId
+if ownsRunningKey then
+    redis.call("DEL", runningKey)
+    keysDeleted.runningKey = true
+end
+
+-- Delete lastRunningKey only if it matches this buildId
+if ownsLastRunningKey then
+    redis.call("DEL", lastRunningKey)
+    keysDeleted.lastRunningKey = true
+end
+
+-- Remove from waiting queue (returns count removed)
+local waitingRemoved = redis.call("LREM", waitingKey, 0, buildId)
+if waitingRemoved > 0 then
+    keysDeleted.waitingKey = true
+end
+
+-- Delete timeout config (unconditionally)
+local timeoutDeleted = redis.call("HDEL", timeoutConfigKey, buildId)
+if timeoutDeleted > 0 then
+    keysDeleted.timeoutConfig = true
+end
+
+-- Delete the deleteKey if it exists
+local deleteKeyExists = redis.call("EXISTS", deleteKey)
+if deleteKeyExists == 1 then
+    redis.call("DEL", deleteKey)
+    keysDeleted.deleteKey = true
+end
+
+-- Report what was cleaned up
+
+-- Determine action based on what was cleaned
+local action = "CLEANED"
+
+if not ownsRunningKey and currentRunningBuildId then
+    -- Different build is running now
+    action = "NOT_RUNNING"
+elseif not keysDeleted.buildConfig and not keysDeleted.runningKey and not keysDeleted.lastRunningKey then
+    -- build already cleaned up
+    action = "ALREADY_CLEAN"
+end
+
+return cjson.encode({
+    action = action,
+    buildId = buildId,
+    jobId = jobId,
+    keysDeleted = keysDeleted,
+    currentRunningBuildId = currentRunningBuildId,
+    ownsRunningKey = ownsRunningKey,
+    ownsLastRunningKey = ownsLastRunningKey
+})
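
The header comment above documents the script's whole interface: five ARGV values, no KEYS, and a JSON result (note the header lists a `PARTIAL` action, while the body actually returns `ALREADY_CLEAN` when there was nothing left to remove). A hedged caller sketch follows, assuming an ioredis-style client and a hypothetical `stopBuildSource` string holding the script text; in the package itself the call would go through `LuaScriptLoader` rather than a raw `eval`.

```js
// Sketch: invoking stopBuild.lua with the ARGV order documented above.
// `redis` is assumed to be an ioredis client and `stopBuildSource` the script
// text; both are stand-ins, not the package's real wiring.
async function stopBuild(redis, stopBuildSource, buildId, jobId) {
    const raw = await redis.eval(
        stopBuildSource,
        0, // the script reads only ARGV, so no KEYS are passed
        String(buildId), // ARGV[1]
        String(jobId), // ARGV[2]
        'resque:', // ARGV[3] queuePrefix
        'running_job_', // ARGV[4] runningJobsPrefix
        'waiting_job_' // ARGV[5] waitingJobsPrefix
    );

    // action is "CLEANED", "NOT_RUNNING" or "ALREADY_CLEAN";
    // keysDeleted reports which keys the script actually removed
    return JSON.parse(raw);
}
```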

package/plugins/worker/lib/timeout.js
CHANGED

@@ -3,90 +3,149 @@
 const logger = require('screwdriver-logger');
 const helper = require('../../helper');
 const { waitingJobsPrefix, runningJobsPrefix, queuePrefix } = require('../../../config/redis');
+
+let luaScriptLoader;
+
+/**
+ * Get LuaScriptLoader instance
+ * @return {LuaScriptLoader} Lua script loader instance
+ */
+function getLuaScriptLoader() {
+    if (!luaScriptLoader) {
+        // eslint-disable-next-line global-require
+        const worker = require('../worker');
+
+        luaScriptLoader = worker.luaScriptLoader;
+    }
+
+    return luaScriptLoader;
+}
+
 const TIMEOUT_CODE = 3;
-const TIMEOUT_BUFFER = 1;
 const DEFAULT_TIMEOUT = 90;
 const hash = `${queuePrefix}timeoutConfigs`;
-const REDIS_LOCK_TTL = 10000; // in ms

 /**
- *
- * @
- * @param {Object} timeoutConfig
+ * Execute timeout cleanup
+ * @param {Object} decision
  * @param {String} buildId
  * @param {Object} redis
  * @param {String} workerId
  * @return {Promise}
  */
-async function process(timeoutConfig, buildId, redis, workerId) {
+async function executeTimeout(decision, buildId, redis, workerId) {
+    const { timeoutMinutes } = decision;
+
+    // Get and update current step
+    let step;
+
     try {
-
-
-
-
-
-
-
-        const { startTime } = timeoutConfig;
+        step = await helper.getCurrentStep({
+            redisInstance: redis,
+            buildId
+        });
+    } catch (err) {
+        logger.error(`worker[${workerId}] -> No active step found for ${buildId}`);
+    }

-
-
-
+    if (step) {
+        await helper.updateStepStop({
+            redisInstance: redis,
+            buildId,
+            stepName: step.name,
+            code: TIMEOUT_CODE
+        });
+    }

-
-
+    await helper.updateBuildStatus({
+        redisInstance: redis,
+        buildId,
+        status: 'FAILURE',
+        statusMessage: `Build failed due to timeout (${timeoutMinutes} minutes)`
+    });

-
-
+    logger.info(`worker[${workerId}] -> Timeout cleanup completed for build ${buildId}`);
+}

-
-
-
+/**
+ * Handle the decision from Lua script
+ * @param {Object} decision - {action, reason, buildId, ...}
+ * @param {String} buildId - Build ID
+ * @param {Object} redis - Redis instance
+ * @param {String} workerId - Worker ID
+ * @return {Promise}
+ */
+async function handleDecision(decision, buildId, redis, workerId) {
+    switch (decision.action) {
+        case 'TIMEOUT':
+            // Build has timed out - execute cleanup
+            logger.info(`worker[${workerId}] -> Build has timed out ${buildId}`);
+            await executeTimeout(decision, buildId, redis, workerId);
+            break;

-
-
-
-
-
-        } catch (err) {
-            logger.error(`worker[${workerId}] -> No active step found for ${buildId}`);
-        }
+        case 'CLEANUP':
+            // Build already completed/cleaned up - just remove from timeout configs
+            logger.info(`worker[${workerId}] -> Build ${buildId} ${decision.reason}, cleaning up`);
+            await redis.hdel(hash, buildId);
+            break;

-
-
-
-                buildId,
-                stepName: step.name,
-                code: TIMEOUT_CODE
-            });
-        }
+        case 'SKIP':
+            // Build still within timeout - do nothing
+            break;

-
-
-
-
-            statusMessage: 'Build failed due to timeout'
-        });
+        default:
+            logger.error(`worker[${workerId}] -> Unknown timeout action: ${decision.action}`);
+    }
+}

-
+/**
+ * @method process
+ * @param {Object} timeoutConfig
+ * @param {String} buildId
+ * @param {Object} redis
+ * @param {String} workerId
+ * @return {Promise}
+ */
+async function process(timeoutConfig, buildId, redis, workerId) {
+    const { jobId, startTime, timeout } = timeoutConfig;

-
+    logger.info(`worker[${workerId}] -> Checking timeout for build ${buildId} (job ${jobId})`);

-
-
-
+    if (!startTime) {
+        logger.warn(`worker[${workerId}] -> startTime not set for buildId: ${buildId}`);
+        await redis.hdel(hash, buildId);

-
-
+        return;
+    }

-
-
-
+    try {
+        const loader = getLuaScriptLoader();
+        const result = await loader.executeScript(
+            'checkTimeout.lua',
+            [],
+            [
+                String(buildId),
+                String(jobId),
+                String(startTime),
+                String(timeout || DEFAULT_TIMEOUT),
+                String(Date.now()),
+                queuePrefix,
+                runningJobsPrefix,
+                waitingJobsPrefix
+            ]
+        );
+
+        const decision = JSON.parse(result);
+
+        logger.info(`worker[${workerId}] -> Build ${buildId}: action=${decision.action}, reason=${decision.reason}`);
+
+        await handleDecision(decision, buildId, redis, workerId);
     } catch (err) {
-
-
+        logger.error(`Error in timeout check for build ${buildId}: ${err.message}`);
+        logger.error(err.stack);

-
+        // Delete key from redis in case of error to prevent reprocessing
+        await redis.hdel(hash, buildId);
     }
 }

@@ -95,11 +154,10 @@ async function process(timeoutConfig, buildId, redis, workerId) {
  * If yes, abort build.
  * @method check
  * @param {Object} redis
- * @param {Object} redlock
  * @param {String} workerId
  * @return {Promise}
  */
-async function check(redis, redlock, workerId) {
+async function check(redis, workerId) {
     const keys = await redis.hkeys(hash);

     if (!keys || keys.length === 0) return;
@@ -107,7 +165,6 @@ async function check(redis, redlock, workerId) {
     await Promise.all(
         keys.map(async buildId => {
             try {
-                const lock = await redlock.lock(buildId, REDIS_LOCK_TTL);
                 const json = await redis.hget(hash, buildId);

                 if (!json) return;
@@ -119,10 +176,8 @@ async function check(redis, redlock, workerId) {
                 }

                 await process(timeoutConfig, buildId, redis, workerId);
-
-                await lock.unlock();
             } catch (err) {
-                logger.error(`worker[${workerId}] ->
+                logger.error(`worker[${workerId}] -> Error checking timeout for build ${buildId}: ${err.message}`);
             }
         })
     );
@@ -135,13 +190,13 @@ async function check(redis, redlock, workerId) {
 const workerAccessMap = {};

 /**
+ * Check timeouts with backoff to avoid excessive polling
  *
  * @param {Object} redis
- * @param {Object} redlock
  * @param {Number} workerId
  * @param {Number} [pollInterval]
  */
-async function checkWithBackOff(redis, redlock, workerId, pollInterval = 60) {
+async function checkWithBackOff(redis, workerId, pollInterval = 60) {
     workerAccessMap[workerId] = workerAccessMap[workerId] || new Date();
     const diffMs = new Date().getTime() - new Date(workerAccessMap[workerId]).getTime();
     const diffSeconds = Math.round(diffMs / 1000);
@@ -150,7 +205,7 @@ async function checkWithBackOff(redis, redlock, workerId, pollInterval = 60) {
     if (diffSeconds >= pollInterval) {
         logger.info('worker[%s] -> Processing timeout checks', workerId);
         workerAccessMap[workerId] = new Date();
-        await check(redis, redlock, workerId);
+        await check(redis, workerId);
     }
 }

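
In the rewritten `process()` above, the timeout decision itself is made inside `checkTimeout.lua`; Node only parses the returned JSON and dispatches on `decision.action` (`TIMEOUT`, `CLEANUP`, `SKIP`). Below is a representative decision object, inferred from the fields this file reads (`action`, `reason`, `timeoutMinutes`); the actual payload may carry more fields.

```js
// Illustrative shape only, based on the fields timeout.js consumes above.
const decision = {
    action: 'TIMEOUT', // or 'CLEANUP' / 'SKIP'
    reason: 'build exceeded configured timeout', // hypothetical wording
    timeoutMinutes: 90 // interpolated into the FAILURE status message
};

// handleDecision(decision, buildId, redis, workerId) then runs executeTimeout()
// for 'TIMEOUT', deletes the timeoutConfigs hash entry for 'CLEANUP', and does
// nothing for 'SKIP'.
```
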
package/plugins/worker/worker.js
CHANGED

@@ -3,21 +3,16 @@
 const { MultiWorker, Scheduler } = require('node-resque');
 const config = require('config');
 const logger = require('screwdriver-logger');
-const Redlock = require('redlock');
 const jobs = require('./lib/jobs');
 const timeout = require('./lib/timeout');
 const helper = require('../helper');
+const LuaScriptLoader = require('./lib/LuaScriptLoader');
 const workerConfig = config.get('worker');
 const { queueNamespace, queuePrefix } = require('../../config/redis');
 const redis = require('../redis');
-
-const redlock = new Redlock([redis], {
-    driftFactor: 0.01, // time in ms
-    retryCount: 5,
-    retryDelay: 200, // time in ms
-    retryJitter: 200 // time in ms
-});
+
 const resqueConnection = { redis, namespace: queueNamespace };
+const luaScriptLoader = new LuaScriptLoader(redis);

 /**
  * Shutdown both worker and scheduler and then exit the process
@@ -60,6 +55,10 @@ const scheduler = new Scheduler({ connection: resqueConnection });
  */
 async function invoke() {
     try {
+        logger.info('Loading Lua scripts...');
+        await luaScriptLoader.loadAllScripts();
+        logger.info('Lua scripts loaded successfully');
+
         /* eslint-disable max-len */
         multiWorker.on('start', workerId => logger.info(`queueWorker->worker[${workerId}] started`));
         multiWorker.on('end', workerId => logger.info(`queueWorker->worker[${workerId}] ended`));
@@ -69,7 +68,7 @@ async function invoke() {
         multiWorker.on('poll', async (workerId, queue) => {
             if (queue === 'builds') {
                 logger.info(`queueWorker->worker[${workerId}] polling ${queue}`);
-                await timeout.checkWithBackOff(redis, redlock, workerId);
+                await timeout.checkWithBackOff(redis, workerId);
             }
         });
         multiWorker.on('job', (workerId, queue, job) =>
@@ -165,5 +164,6 @@ module.exports = {
     multiWorker,
     scheduler,
     shutDownAll,
-    cleanUp
+    cleanUp,
+    luaScriptLoader
 };