screwdriver-buildcluster-queue-worker 5.0.2 → 5.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -0
- package/WORKFLOW.md +166 -0
- package/config/custom-environment-variables.yaml +2 -0
- package/config/default.yaml +2 -0
- package/lib/config.js +4 -2
- package/lib/helper.js +9 -4
- package/package.json +2 -2
- package/receiver.js +55 -3
package/README.md
CHANGED
|
@@ -9,6 +9,18 @@
|
|
|
9
9
|
npm install screwdriver-buildcluster-queue-worker
|
|
10
10
|
```
|
|
11
11
|
|
|
12
|
+
## Build Start Workflow
|
|
13
|
+
|
|
14
|
+
The queue worker processes build start messages from RabbitMQ and manages pod lifecycle in Kubernetes.
|
|
15
|
+
|
|
16
|
+
> **See [WORKFLOW.md](WORKFLOW.md) for a detailed workflow diagram with retry behavior**
|
|
17
|
+
|
|
18
|
+
### Configuration
|
|
19
|
+
|
|
20
|
+
- `prefetchCount`: 20 messages per worker (default)
|
|
21
|
+
- `initTimeout`: 5 minutes (default; set via `RABBITMQ_BUILD_INIT_TIMEOUT`)
|
|
22
|
+
- `messageReprocessLimit`: 5 retries in retry queue (default)
|
|
23
|
+
|
|
12
24
|
## Testing
|
|
13
25
|
|
|
14
26
|
```bash
|
package/WORKFLOW.md
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# Build Start Workflow - Detailed Flow
|
|
2
|
+
|
|
3
|
+
## Main Queue Processing
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
7
|
+
│ MAIN QUEUE: Message Processing │
|
|
8
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
9
|
+
|
|
10
|
+
┌──────────────────┐
|
|
11
|
+
│ Receive Message │
|
|
12
|
+
│ (prefetch=20) │
|
|
13
|
+
└────────┬─────────┘
|
|
14
|
+
│
|
|
15
|
+
├─────────────────────────────────────────────────────────┐
|
|
16
|
+
│ │
|
|
17
|
+
┌────────▼─────────┐ ┌─────────▼─────────┐
|
|
18
|
+
│ Start Timeout │ │ Spawn Thread │
|
|
19
|
+
│ (5 min timer) │ │ Call _start() │
|
|
20
|
+
└──────────────────┘ └─────────┬─────────┘
|
|
21
|
+
│ │
|
|
22
|
+
│ ┌──────────▼──────────┐
|
|
23
|
+
│ │ Try Create K8s Pod │
|
|
24
|
+
│ │ (POST to K8s API) │
|
|
25
|
+
│ └──────────┬──────────┘
|
|
26
|
+
│ │
|
|
27
|
+
│ ┌────────────────────┼───────────────┐
|
|
28
|
+
│ │ │ │
|
|
29
|
+
│ ┌──────────▼─────────┐ ┌──────▼─────────────────────┐
|
|
30
|
+
│ │ Success (201) │ │ API Error (500/503/etc) │
|
|
31
|
+
│ │ Pod Created! │ │ Network error, K8s down │
|
|
32
|
+
│ └──────────┬─────────┘ └──────┬─────────────────────┘
|
|
33
|
+
│ │ │
|
|
34
|
+
│ ┌──────────▼──────────────┐ ┌─▼──────────────────────┐
|
|
35
|
+
│ │ Check Pod Status │ │ THROW EXCEPTION │
|
|
36
|
+
│ │ (GET pod/status) │ │ "Failed to create pod" │
|
|
37
|
+
│ └──────────┬──────────────┘ └─┬──────────────────────┘
|
|
38
|
+
│ │ │
|
|
39
|
+
│ ┌─────────────┼─────────────┐ │ .on('error')
|
|
40
|
+
│ │ │ │ │
|
|
41
|
+
│ ┌──────────▼─────┐ ┌───▼───┐ ┌─────▼──────┐▼──────────────────┐
|
|
42
|
+
│ │ Pod Status: │ │ Pod: │ │ Pod Status:││ Retry < 5? │
|
|
43
|
+
│ │ pending/running│ │failed │ │ unknown ││ YES: NACK (retry)│
|
|
44
|
+
│ └──────────┬─────┘ └───┬───┘ └─────┬──────┘│ NO: FAILURE+ACK │
|
|
45
|
+
│ │ │ │ └──────────────────┘
|
|
46
|
+
│ ┌──────────▼─────┐ ┌───▼─────────────▼───┐
|
|
47
|
+
│ │ Return TRUE │ │ Return FALSE │
|
|
48
|
+
│ │ "Pod OK" │ │ "Status check failed"│
|
|
49
|
+
│ └──────────┬─────┘ └───┬──────────────────┘
|
|
50
|
+
│ │ │
|
|
51
|
+
│ ┌──────────▼─────┐ ┌───▼──────────────┐
|
|
52
|
+
│ │ ACK message │ │ Clear timeout │
|
|
53
|
+
│ │ (free prefetch)│ │ ACK message │
|
|
54
|
+
│ └──────────┬─────┘ │ Push to RETRY │
|
|
55
|
+
│ │ │ QUEUE (verify) │
|
|
56
|
+
│ ┌──────────▼─────┐ └───┬──────────────┘
|
|
57
|
+
│ │ DON'T clear │ │
|
|
58
|
+
│ │ timeout! │ │
|
|
59
|
+
│ │ (keep monitor) │ │
|
|
60
|
+
│ └──────────┬─────┘ │
|
|
61
|
+
│ │ │
|
|
62
|
+
│◄─────────────────────┘ │
|
|
63
|
+
│ │
|
|
64
|
+
┌────────▼─────────┐ │
|
|
65
|
+
│ Wait 5 minutes │ │
|
|
66
|
+
└────────┬─────────┘ │
|
|
67
|
+
│ │
|
|
68
|
+
┌────────▼───────────────────────┐ │
|
|
69
|
+
│ Timeout Fires! │ │
|
|
70
|
+
│ Update build statusmessage: │ │
|
|
71
|
+
│ "Build initialization delayed" │ │
|
|
72
|
+
└────────┬───────────────────────┘ │
|
|
73
|
+
│ │
|
|
74
|
+
┌────────▼─────────┐ │
|
|
75
|
+
│ Push to │◄───────────────────────┘
|
|
76
|
+
│ RETRY QUEUE │
|
|
77
|
+
└────────┬─────────┘
|
|
78
|
+
│
|
|
79
|
+
│
|
|
80
|
+
┌────────────▼─────────────────────────────────────────────────────────────────┐
|
|
81
|
+
│ RETRY QUEUE: Pod Verification │
|
|
82
|
+
└──────────────────────────────────────────────────────────────────────────────┘
|
|
83
|
+
|
|
84
|
+
┌────────────────────┐
|
|
85
|
+
│ Receive Message │
|
|
86
|
+
│ from Retry Queue │
|
|
87
|
+
└─────────┬──────────┘
|
|
88
|
+
│
|
|
89
|
+
┌─────────▼──────────┐
|
|
90
|
+
│ Spawn Thread │
|
|
91
|
+
│ Call _verify() │
|
|
92
|
+
└─────────┬──────────┘
|
|
93
|
+
│
|
|
94
|
+
┌─────────▼────────────────┐
|
|
95
|
+
│ Try Get Pod Status │
|
|
96
|
+
│ (GET pods?labelSelector) │
|
|
97
|
+
└─────────┬────────────────┘
|
|
98
|
+
│
|
|
99
|
+
┌─────────┼────────────────────────────┐
|
|
100
|
+
│ │ │
|
|
101
|
+
┌───▼─────────────┐ ┌─────────▼────────────────┐
|
|
102
|
+
│ Success │ │ API Error (K8s API down) │
|
|
103
|
+
│ Got pod status │ │ Network issue │
|
|
104
|
+
└───┬─────────────┘ └─────────┬────────────────┘
|
|
105
|
+
│ │
|
|
106
|
+
│ ┌─────────▼────────────────┐
|
|
107
|
+
│ │ THROW EXCEPTION │
|
|
108
|
+
│ │ .on('error') │
|
|
109
|
+
│ └─────────┬────────────────┘
|
|
110
|
+
│ │
|
|
111
|
+
│ ┌─────────▼────────────────┐
|
|
112
|
+
│ │ Retry < 5? │
|
|
113
|
+
│ │ YES: NACK (retry verify) │
|
|
114
|
+
│ │ NO: FAILURE + ACK │
|
|
115
|
+
│ └──────────────────────────┘
|
|
116
|
+
│
|
|
117
|
+
▼
|
|
118
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
119
|
+
│ Check Pod Status & Container Waiting Reason │
|
|
120
|
+
└─────────┬────────────────────────────────────────────────────────┘
|
|
121
|
+
│
|
|
122
|
+
┌─────┴──────────┬────────────────┬───────────────┬─────────────────┐
|
|
123
|
+
│ │ │ │ │
|
|
124
|
+
┌───▼────────────┐ ┌▼──────────┐ ┌─▼────────────┐ ┌▼───────────────┐ ┌▼──────────────┐
|
|
125
|
+
│ Pod Status: │ │ Pod: │ │ Pod: │ │ Pod: │ │ Pod: │
|
|
126
|
+
│ running/ │ │ failed/ │ │ pending + │ │ pending + │ │ pending + │
|
|
127
|
+
│ succeeded │ │ unknown │ │ ErrImagePull │ │ CrashLoopBack │ │ PodInitializing│
|
|
128
|
+
└───┬────────────┘ └┬──────────┘ └─┬────────────┘ └┬───────────────┘ └┬──────────────┘
|
|
129
|
+
│ │ │ │ │
|
|
130
|
+
┌───▼────────────┐ ┌▼────────────────────────────────▼──────────────────▼──────────────┐
|
|
131
|
+
│ Return EMPTY │ │ Return ERROR MESSAGE │
|
|
132
|
+
│ (success) │ │ "Build failed to start..." │
|
|
133
|
+
└───┬────────────┘ └┬───────────────────────────────────────────────────────────────────┘
|
|
134
|
+
│ │ │
|
|
135
|
+
┌───▼────────────┐ ┌▼────────────────┐ ┌─────────▼──────────┐
|
|
136
|
+
│ ACK message │ │ Update build to │ │ Return EMPTY │
|
|
137
|
+
│ (build OK) │ │ FAILURE │ │ (allow more time │
|
|
138
|
+
└────────────────┘ │ ACK message │ │ for image pull) │
|
|
139
|
+
└─────────────────┘ └─────────┬──────────┘
|
|
140
|
+
│
|
|
141
|
+
┌─────────▼──────────┐
|
|
142
|
+
│ ACK message │
|
|
143
|
+
│ (pod still healthy │
|
|
144
|
+
│ may take 10+ min) │
|
|
145
|
+
└────────────────────┘
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Key Points
|
|
149
|
+
|
|
150
|
+
### Main Queue Retries (NACK):
|
|
151
|
+
- **When**: Pod creation throws exception (K8s API error, network issue)
|
|
152
|
+
- **Why**: Pod was never created, safe to retry
|
|
153
|
+
- **How many**: Up to 5 times via RabbitMQ requeue
|
|
154
|
+
- **After max retries**: Update build to FAILURE and ACK
|
|
155
|
+
|
|
156
|
+
### Retry Queue Retries (NACK):
|
|
157
|
+
- **When**: _verify() throws exception (can't get pod status from K8s)
|
|
158
|
+
- **Why**: Transient API issue, pod might be fine
|
|
159
|
+
- **How many**: Up to 5 times via RabbitMQ requeue
|
|
160
|
+
- **After max retries**: Update build to FAILURE and ACK
|
|
161
|
+
|
|
162
|
+
### No Retries (ACK immediately):
|
|
163
|
+
- Pod created successfully (pending/running status) → main queue
|
|
164
|
+
- Pod status check failed (pod exists but failed/unknown) → main queue → retry queue
|
|
165
|
+
- Verify detects failed pod (returns error message) → retry queue
|
|
166
|
+
- Verify detects healthy pod (returns empty) → retry queue
|
package/config/custom-environment-variables.yaml
CHANGED
|
@@ -363,6 +363,8 @@ rabbitmq:
|
|
|
363
363
|
retryQueueEnabled: RABBITMQ_RETRYQUEUE_ENABLED
|
|
364
364
|
# Exchange / router name for rabbitmq
|
|
365
365
|
exchange: RABBITMQ_EXCHANGE
|
|
366
|
+
# build pod initialization timeout
|
|
367
|
+
initTimeout: RABBITMQ_BUILD_INIT_TIMEOUT
|
|
366
368
|
httpd:
|
|
367
369
|
# Port to listen on
|
|
368
370
|
port: PORT
|
package/config/default.yaml
CHANGED
package/lib/config.js
CHANGED
|
@@ -19,7 +19,8 @@ const {
|
|
|
19
19
|
messageReprocessLimit,
|
|
20
20
|
retryQueue,
|
|
21
21
|
retryQueueEnabled,
|
|
22
|
-
exchange
|
|
22
|
+
exchange,
|
|
23
|
+
initTimeout
|
|
23
24
|
} = rabbitmqConfig;
|
|
24
25
|
const amqpURI = `${protocol}://${username}:${password}@${host}:${port}${vhost}`;
|
|
25
26
|
|
|
@@ -58,7 +59,8 @@ function getConfig() {
|
|
|
58
59
|
cachePath: path,
|
|
59
60
|
retryQueue,
|
|
60
61
|
retryQueueEnabled: convertToBool(retryQueueEnabled),
|
|
61
|
-
exchange
|
|
62
|
+
exchange,
|
|
63
|
+
initTimeout: Number(initTimeout) || 5
|
|
62
64
|
};
|
|
63
65
|
}
|
|
64
66
|
|
package/lib/helper.js
CHANGED
|
@@ -12,14 +12,19 @@ const request = require('screwdriver-request');
|
|
|
12
12
|
*/
|
|
13
13
|
async function updateBuildStatusAsync(config, status, statusMessage) {
|
|
14
14
|
const { buildId } = config;
|
|
15
|
+
const payload = {};
|
|
16
|
+
|
|
17
|
+
if (status) {
|
|
18
|
+
payload.status = status;
|
|
19
|
+
}
|
|
20
|
+
if (statusMessage) {
|
|
21
|
+
payload.statusMessage = statusMessage;
|
|
22
|
+
}
|
|
15
23
|
|
|
16
24
|
return request({
|
|
17
25
|
method: 'PUT',
|
|
18
26
|
url: `${config.apiUri}/v4/builds/${buildId}`,
|
|
19
|
-
json: {
|
|
20
|
-
status,
|
|
21
|
-
statusMessage
|
|
22
|
-
},
|
|
27
|
+
json: payload,
|
|
23
28
|
context: {
|
|
24
29
|
token: config.token
|
|
25
30
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "screwdriver-buildcluster-queue-worker",
|
|
3
|
-
"version": "5.0.2",
|
|
3
|
+
"version": "5.2.0",
|
|
4
4
|
"description": "An amqp connection manager implementation that consumes jobs from Rabbitmq queue.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"scripts": {
|
|
@@ -49,7 +49,7 @@
|
|
|
49
49
|
"fs": "0.0.2",
|
|
50
50
|
"fs-extra": "^11.1.0",
|
|
51
51
|
"path": "^0.12.7",
|
|
52
|
-
"screwdriver-executor-k8s": "^17.1
|
|
52
|
+
"screwdriver-executor-k8s": "^17.3.1",
|
|
53
53
|
"screwdriver-executor-k8s-vm": "^5.0.0",
|
|
54
54
|
"screwdriver-executor-router": "^5.0.0",
|
|
55
55
|
"screwdriver-logger": "^3.0.0",
|
package/receiver.js
CHANGED
|
@@ -16,11 +16,13 @@ const {
|
|
|
16
16
|
cacheStrategy,
|
|
17
17
|
cachePath,
|
|
18
18
|
retryQueue,
|
|
19
|
-
retryQueueEnabled
|
|
19
|
+
retryQueueEnabled,
|
|
20
|
+
initTimeout
|
|
20
21
|
} = config.getConfig();
|
|
21
22
|
const { spawn } = threads;
|
|
22
23
|
const CACHE_STRATEGY_DISK = 'disk';
|
|
23
24
|
let channelWrapper;
|
|
25
|
+
const INIT_TIMEOUT = initTimeout * 60 * 1000; // milliseconds
|
|
24
26
|
|
|
25
27
|
/**
|
|
26
28
|
* onMessage consume messages in batches, once its available in the queue. channelWrapper has in-built back pressure
|
|
@@ -103,18 +105,68 @@ const onMessage = data => {
|
|
|
103
105
|
}
|
|
104
106
|
}
|
|
105
107
|
|
|
108
|
+
let timeoutWarningLogged = false;
|
|
109
|
+
let timeoutTimer = null;
|
|
110
|
+
|
|
111
|
+
if (jobType === 'start') {
|
|
112
|
+
timeoutTimer = setTimeout(async () => {
|
|
113
|
+
if (!timeoutWarningLogged) {
|
|
114
|
+
timeoutWarningLogged = true;
|
|
115
|
+
const timeoutMessage = `Build initialization timeout exceeded (${initTimeout}min) for ${job}`;
|
|
116
|
+
|
|
117
|
+
logger.error(timeoutMessage);
|
|
118
|
+
|
|
119
|
+
// Update build statusmessage only to show delayed initialization
|
|
120
|
+
try {
|
|
121
|
+
await helper.updateBuildStatusAsync(
|
|
122
|
+
buildConfig,
|
|
123
|
+
undefined,
|
|
124
|
+
'Build initialization delayed - pod creation taking longer than expected'
|
|
125
|
+
);
|
|
126
|
+
logger.info(`Build status updated with delay warning for build ${buildId}`);
|
|
127
|
+
} catch (err) {
|
|
128
|
+
logger.error(
|
|
129
|
+
`Failed to update build status with delay warning for build:${buildId}:${err}`
|
|
130
|
+
);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// Push to retry queue for verification and potential failure
|
|
134
|
+
// This allows verify to check pod status and fail if still pending
|
|
135
|
+
logger.info(`Pushing ${job} to retry queue for verification after timeout`);
|
|
136
|
+
retryQueueLib.push(buildConfig, buildId);
|
|
137
|
+
}
|
|
138
|
+
}, INIT_TIMEOUT);
|
|
139
|
+
}
|
|
140
|
+
|
|
106
141
|
thread
|
|
107
142
|
.send([jobType, buildConfig, job])
|
|
108
143
|
.on('message', successful => {
|
|
109
144
|
logger.info(`acknowledge, job completed for ${job}, result: ${successful}`);
|
|
145
|
+
|
|
110
146
|
if (!successful && jobType === 'start') {
|
|
111
|
-
//
|
|
147
|
+
// Pod failed immediately (status check returned false)
|
|
148
|
+
// Clear timeout and push to retry queue for immediate verification
|
|
149
|
+
if (timeoutTimer) {
|
|
150
|
+
clearTimeout(timeoutTimer);
|
|
151
|
+
}
|
|
112
152
|
retryQueueLib.push(buildConfig, buildId);
|
|
153
|
+
} else if (successful && jobType === 'start') {
|
|
154
|
+
// Pod created successfully - DON'T clear timeout
|
|
155
|
+
// Let the timeout fire to verify pod eventually started
|
|
156
|
+
// This handles pods that get stuck in pending after creation
|
|
157
|
+
logger.info(`Timeout remains active for ${job}, will verify after ${initTimeout}min`);
|
|
158
|
+
} else if (timeoutTimer) {
|
|
159
|
+
// For non-start jobs (stop, verify), or other cases, clear timeout normally
|
|
160
|
+
clearTimeout(timeoutTimer);
|
|
113
161
|
}
|
|
162
|
+
|
|
114
163
|
channelWrapper.ack(data);
|
|
115
164
|
thread.kill();
|
|
116
165
|
})
|
|
117
166
|
.on('error', async error => {
|
|
167
|
+
if (timeoutTimer) {
|
|
168
|
+
clearTimeout(timeoutTimer);
|
|
169
|
+
}
|
|
118
170
|
thread.kill();
|
|
119
171
|
if (['403', '404'].includes(error.message.substring(0, 3))) {
|
|
120
172
|
channelWrapper.ack(data);
|
|
@@ -236,7 +288,7 @@ const listen = async () => {
|
|
|
236
288
|
const queueFn = [channel.checkQueue(queue), channel.prefetch(prefetchCount), channel.consume(queue, onMessage)];
|
|
237
289
|
|
|
238
290
|
if (retryQueueEnabled) {
|
|
239
|
-
queueFn.
|
|
291
|
+
queueFn.push(channel.checkQueue(retryQueue), channel.consume(retryQueue, onRetryMessage));
|
|
240
292
|
}
|
|
241
293
|
|
|
242
294
|
return Promise.all(queueFn);
|