screwdriver-buildcluster-queue-worker 5.0.2 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,6 +9,18 @@
9
9
  npm install screwdriver-buildcluster-queue-worker
10
10
  ```
11
11
 
12
+ ## Build Start Workflow
13
+
14
+ The queue worker processes build start messages from RabbitMQ and manages pod lifecycle in Kubernetes.
15
+
16
+ > **See [WORKFLOW.md](WORKFLOW.md) for a detailed workflow diagram, including retry behavior.**
17
+
18
+ ### Configuration
19
+
20
+ - `prefetchCount`: 20 messages per worker (default)
21
+ - `initTimeout`: build pod initialization timeout, 5 minutes (default); set via `RABBITMQ_BUILD_INIT_TIMEOUT`
22
+ - `messageReprocessLimit`: 5 retries in retry queue (default)
23
+
12
24
  ## Testing
13
25
 
14
26
  ```bash
package/WORKFLOW.md ADDED
@@ -0,0 +1,166 @@
1
+ # Build Start Workflow - Detailed Flow
2
+
3
+ ## Main Queue Processing
4
+
5
+ ```
6
+ ┌─────────────────────────────────────────────────────────────────────────────┐
7
+ │ MAIN QUEUE: Message Processing │
8
+ └─────────────────────────────────────────────────────────────────────────────┘
9
+
10
+ ┌──────────────────┐
11
+ │ Receive Message │
12
+ │ (prefetch=20) │
13
+ └────────┬─────────┘
14
+
15
+ ├─────────────────────────────────────────────────────────┐
16
+ │ │
17
+ ┌────────▼─────────┐ ┌─────────▼─────────┐
18
+ │ Start Timeout │ │ Spawn Thread │
19
+ │ (5 min timer) │ │ Call _start() │
20
+ └──────────────────┘ └─────────┬─────────┘
21
+ │ │
22
+ │ ┌──────────▼──────────┐
23
+ │ │ Try Create K8s Pod │
24
+ │ │ (POST to K8s API) │
25
+ │ └──────────┬──────────┘
26
+ │ │
27
+ │ ┌────────────────────┼───────────────┐
28
+ │ │ │ │
29
+ │ ┌──────────▼─────────┐ ┌──────▼─────────────────────┐
30
+ │ │ Success (201) │ │ API Error (500/503/etc) │
31
+ │ │ Pod Created! │ │ Network error, K8s down │
32
+ │ └──────────┬─────────┘ └──────┬─────────────────────┘
33
+ │ │ │
34
+ │ ┌──────────▼──────────────┐ ┌─▼──────────────────────┐
35
+ │ │ Check Pod Status │ │ THROW EXCEPTION │
36
+ │ │ (GET pod/status) │ │ "Failed to create pod" │
37
+ │ └──────────┬──────────────┘ └─┬──────────────────────┘
38
+ │ │ │
39
+ │ ┌─────────────┼─────────────┐ │ .on('error')
40
+ │ │ │ │ │
41
+ │ ┌──────────▼─────┐ ┌───▼───┐ ┌─────▼──────┐▼──────────────────┐
42
+ │ │ Pod Status: │ │ Pod: │ │ Pod Status:││ Retry < 5? │
43
+ │ │ pending/running│ │failed │ │ unknown ││ YES: NACK (retry)│
44
+ │ └──────────┬─────┘ └───┬───┘ └─────┬──────┘│ NO: FAILURE+ACK │
45
+ │ │ │ │ └──────────────────┘
46
+ │ ┌──────────▼─────┐ ┌───▼─────────────▼───┐
47
+ │ │ Return TRUE │ │ Return FALSE │
48
+ │ │ "Pod OK" │ │ "Status check failed"│
49
+ │ └──────────┬─────┘ └───┬──────────────────┘
50
+ │ │ │
51
+ │ ┌──────────▼─────┐ ┌───▼──────────────┐
52
+ │ │ ACK message │ │ Clear timeout │
53
+ │ │ (free prefetch)│ │ ACK message │
54
+ │ └──────────┬─────┘ │ Push to RETRY │
55
+ │ │ │ QUEUE (verify) │
56
+ │ ┌──────────▼─────┐ └───┬──────────────┘
57
+ │ │ DON'T clear │ │
58
+ │ │ timeout! │ │
59
+ │ │ (keep monitor) │ │
60
+ │ └──────────┬─────┘ │
61
+ │ │ │
62
+ │◄─────────────────────┘ │
63
+ │ │
64
+ ┌────────▼─────────┐ │
65
+ │ Wait 5 minutes │ │
66
+ └────────┬─────────┘ │
67
+ │ │
68
+ ┌────────▼───────────────────────┐ │
69
+ │ Timeout Fires! │ │
70
+ │ Update build statusmessage: │ │
71
+ │ "Build initialization delayed" │ │
72
+ └────────┬───────────────────────┘ │
73
+ │ │
74
+ ┌────────▼─────────┐ │
75
+ │ Push to │◄───────────────────────┘
76
+ │ RETRY QUEUE │
77
+ └────────┬─────────┘
78
+
79
+
80
+ ┌────────────▼─────────────────────────────────────────────────────────────────┐
81
+ │ RETRY QUEUE: Pod Verification │
82
+ └──────────────────────────────────────────────────────────────────────────────┘
83
+
84
+ ┌────────────────────┐
85
+ │ Receive Message │
86
+ │ from Retry Queue │
87
+ └─────────┬──────────┘
88
+
89
+ ┌─────────▼──────────┐
90
+ │ Spawn Thread │
91
+ │ Call _verify() │
92
+ └─────────┬──────────┘
93
+
94
+ ┌─────────▼────────────────┐
95
+ │ Try Get Pod Status │
96
+ │ (GET pods?labelSelector) │
97
+ └─────────┬────────────────┘
98
+
99
+ ┌─────────┼────────────────────────────┐
100
+ │ │ │
101
+ ┌───▼─────────────┐ ┌─────────▼────────────────┐
102
+ │ Success │ │ API Error (K8s API down) │
103
+ │ Got pod status │ │ Network issue │
104
+ └───┬─────────────┘ └─────────┬────────────────┘
105
+ │ │
106
+ │ ┌─────────▼────────────────┐
107
+ │ │ THROW EXCEPTION │
108
+ │ │ .on('error') │
109
+ │ └─────────┬────────────────┘
110
+ │ │
111
+ │ ┌─────────▼────────────────┐
112
+ │ │ Retry < 5? │
113
+ │ │ YES: NACK (retry verify) │
114
+ │ │ NO: FAILURE + ACK │
115
+ │ └──────────────────────────┘
116
+
117
+
118
+ ┌─────────────────────────────────────────────────────────────────┐
119
+ │ Check Pod Status & Container Waiting Reason │
120
+ └─────────┬────────────────────────────────────────────────────────┘
121
+
122
+ ┌─────┴──────────┬────────────────┬───────────────┬─────────────────┐
123
+ │ │ │ │ │
124
+ ┌───▼────────────┐ ┌▼──────────┐ ┌─▼────────────┐ ┌▼───────────────┐ ┌▼──────────────┐
125
+ │ Pod Status: │ │ Pod: │ │ Pod: │ │ Pod: │ │ Pod: │
126
+ │ running/ │ │ failed/ │ │ pending + │ │ pending + │ │ pending + │
127
+ │ succeeded │ │ unknown │ │ ErrImagePull │ │ CrashLoopBack │ │ PodInitializing│
128
+ └───┬────────────┘ └┬──────────┘ └─┬────────────┘ └┬───────────────┘ └┬──────────────┘
129
+ │ │ │ │ │
130
+ ┌───▼────────────┐ ┌▼────────────────────────────────▼──────────────────▼──────────────┐
131
+ │ Return EMPTY │ │ Return ERROR MESSAGE │
132
+ │ (success) │ │ "Build failed to start..." │
133
+ └───┬────────────┘ └┬───────────────────────────────────────────────────────────────────┘
134
+ │ │ │
135
+ ┌───▼────────────┐ ┌▼────────────────┐ ┌─────────▼──────────┐
136
+ │ ACK message │ │ Update build to │ │ Return EMPTY │
137
+ │ (build OK) │ │ FAILURE │ │ (allow more time │
138
+ └────────────────┘ │ ACK message │ │ for image pull) │
139
+ └─────────────────┘ └─────────┬──────────┘
140
+
141
+ ┌─────────▼──────────┐
142
+ │ ACK message │
143
+ │ (pod still healthy │
144
+ │ may take 10+ min) │
145
+ └────────────────────┘
146
+ ```
147
+
148
+ ## Key Points
149
+
150
+ ### Main Queue Retries (NACK):
151
+ - **When**: Pod creation throws exception (K8s API error, network issue)
152
+ - **Why**: Pod was never created, safe to retry
153
+ - **How many**: Up to 5 times via RabbitMQ requeue
154
+ - **After max retries**: Update build to FAILURE and ACK
155
+
156
+ ### Retry Queue Retries (NACK):
157
+ - **When**: _verify() throws exception (can't get pod status from K8s)
158
+ - **Why**: Transient API issue, pod might be fine
159
+ - **How many**: Up to 5 times via RabbitMQ requeue
160
+ - **After max retries**: Update build to FAILURE and ACK
161
+
162
+ ### No Retries (ACK immediately):
163
+ - Pod created successfully (pending/running status) → main queue
164
+ - Pod status check failed (pod exists but failed/unknown) → main queue → retry queue
165
+ - Verify detects failed pod (returns error message) → retry queue
166
+ - Verify detects healthy pod (returns empty) → retry queue
@@ -363,6 +363,8 @@ rabbitmq:
363
363
  retryQueueEnabled: RABBITMQ_RETRYQUEUE_ENABLED
364
364
  # Exchange / router name for rabbitmq
365
365
  exchange: RABBITMQ_EXCHANGE
366
+ # build pod initialization timeout in minutes
367
+ initTimeout: RABBITMQ_BUILD_INIT_TIMEOUT
366
368
  httpd:
367
369
  # Port to listen on
368
370
  port: PORT
@@ -247,6 +247,8 @@ rabbitmq:
247
247
  retryQueueEnabled: false
248
248
  # Exchange / router name for rabbitmq
249
249
  exchange: build
250
+ # build pod initialization timeout in minutes
251
+ initTimeout: "5"
250
252
  httpd:
251
253
  # Port to listen on
252
254
  port: 80
package/lib/config.js CHANGED
@@ -19,7 +19,8 @@ const {
19
19
  messageReprocessLimit,
20
20
  retryQueue,
21
21
  retryQueueEnabled,
22
- exchange
22
+ exchange,
23
+ initTimeout
23
24
  } = rabbitmqConfig;
24
25
  const amqpURI = `${protocol}://${username}:${password}@${host}:${port}${vhost}`;
25
26
 
@@ -58,7 +59,8 @@ function getConfig() {
58
59
  cachePath: path,
59
60
  retryQueue,
60
61
  retryQueueEnabled: convertToBool(retryQueueEnabled),
61
- exchange
62
+ exchange,
63
+ initTimeout: Number(initTimeout) || 5
62
64
  };
63
65
  }
64
66
 
package/lib/helper.js CHANGED
@@ -12,14 +12,19 @@ const request = require('screwdriver-request');
12
12
  */
13
13
  async function updateBuildStatusAsync(config, status, statusMessage) {
14
14
  const { buildId } = config;
15
+ const payload = {};
16
+
17
+ if (status) {
18
+ payload.status = status;
19
+ }
20
+ if (statusMessage) {
21
+ payload.statusMessage = statusMessage;
22
+ }
15
23
 
16
24
  return request({
17
25
  method: 'PUT',
18
26
  url: `${config.apiUri}/v4/builds/${buildId}`,
19
- json: {
20
- status,
21
- statusMessage
22
- },
27
+ json: payload,
23
28
  context: {
24
29
  token: config.token
25
30
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "screwdriver-buildcluster-queue-worker",
3
- "version": "5.0.2",
3
+ "version": "5.2.0",
4
4
  "description": "An amqp connection manager implementation that consumes jobs from Rabbitmq queue.",
5
5
  "main": "index.js",
6
6
  "scripts": {
@@ -49,7 +49,7 @@
49
49
  "fs": "0.0.2",
50
50
  "fs-extra": "^11.1.0",
51
51
  "path": "^0.12.7",
52
- "screwdriver-executor-k8s": "^17.1.0",
52
+ "screwdriver-executor-k8s": "^17.3.1",
53
53
  "screwdriver-executor-k8s-vm": "^5.0.0",
54
54
  "screwdriver-executor-router": "^5.0.0",
55
55
  "screwdriver-logger": "^3.0.0",
package/receiver.js CHANGED
@@ -16,11 +16,13 @@ const {
16
16
  cacheStrategy,
17
17
  cachePath,
18
18
  retryQueue,
19
- retryQueueEnabled
19
+ retryQueueEnabled,
20
+ initTimeout
20
21
  } = config.getConfig();
21
22
  const { spawn } = threads;
22
23
  const CACHE_STRATEGY_DISK = 'disk';
23
24
  let channelWrapper;
25
+ const INIT_TIMEOUT = initTimeout * 60 * 1000; // milliseconds
24
26
 
25
27
  /**
26
28
  * onMessage consumes messages in batches once they are available in the queue. channelWrapper has in-built back pressure
@@ -103,18 +105,68 @@ const onMessage = data => {
103
105
  }
104
106
  }
105
107
 
108
+ let timeoutWarningLogged = false;
109
+ let timeoutTimer = null;
110
+
111
+ if (jobType === 'start') {
112
+ timeoutTimer = setTimeout(async () => {
113
+ if (!timeoutWarningLogged) {
114
+ timeoutWarningLogged = true;
115
+ const timeoutMessage = `Build initialization timeout exceeded (${initTimeout}min) for ${job}`;
116
+
117
+ logger.error(timeoutMessage);
118
+
119
+ // Update build statusmessage only to show delayed initialization
120
+ try {
121
+ await helper.updateBuildStatusAsync(
122
+ buildConfig,
123
+ undefined,
124
+ 'Build initialization delayed - pod creation taking longer than expected'
125
+ );
126
+ logger.info(`Build status updated with delay warning for build ${buildId}`);
127
+ } catch (err) {
128
+ logger.error(
129
+ `Failed to update build status with delay warning for build:${buildId}:${err}`
130
+ );
131
+ }
132
+
133
+ // Push to retry queue for verification and potential failure
134
+ // This allows verify to check pod status and fail if still pending
135
+ logger.info(`Pushing ${job} to retry queue for verification after timeout`);
136
+ retryQueueLib.push(buildConfig, buildId);
137
+ }
138
+ }, INIT_TIMEOUT);
139
+ }
140
+
106
141
  thread
107
142
  .send([jobType, buildConfig, job])
108
143
  .on('message', successful => {
109
144
  logger.info(`acknowledge, job completed for ${job}, result: ${successful}`);
145
+
110
146
  if (!successful && jobType === 'start') {
111
- // push to retry only for start jobs
147
+ // Pod failed immediately (status check returned false)
148
+ // Clear timeout and push to retry queue for immediate verification
149
+ if (timeoutTimer) {
150
+ clearTimeout(timeoutTimer);
151
+ }
112
152
  retryQueueLib.push(buildConfig, buildId);
153
+ } else if (successful && jobType === 'start') {
154
+ // Pod created successfully - DON'T clear timeout
155
+ // Let the timeout fire to verify pod eventually started
156
+ // This handles pods that get stuck in pending after creation
157
+ logger.info(`Timeout remains active for ${job}, will verify after ${initTimeout}min`);
158
+ } else if (timeoutTimer) {
159
+ // For non-start jobs (stop, verify), or other cases, clear timeout normally
160
+ clearTimeout(timeoutTimer);
113
161
  }
162
+
114
163
  channelWrapper.ack(data);
115
164
  thread.kill();
116
165
  })
117
166
  .on('error', async error => {
167
+ if (timeoutTimer) {
168
+ clearTimeout(timeoutTimer);
169
+ }
118
170
  thread.kill();
119
171
  if (['403', '404'].includes(error.message.substring(0, 3))) {
120
172
  channelWrapper.ack(data);
@@ -236,7 +288,7 @@ const listen = async () => {
236
288
  const queueFn = [channel.checkQueue(queue), channel.prefetch(prefetchCount), channel.consume(queue, onMessage)];
237
289
 
238
290
  if (retryQueueEnabled) {
239
- queueFn.concat([channel.checkQueue(retryQueue), channel.consume(retryQueue, onRetryMessage)]);
291
+ queueFn.push(channel.checkQueue(retryQueue), channel.consume(retryQueue, onRetryMessage));
240
292
  }
241
293
 
242
294
  return Promise.all(queueFn);