screwdriver-buildcluster-queue-worker 5.2.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,15 +11,14 @@ npm install screwdriver-buildcluster-queue-worker
11
11
 
12
12
  ## Build Start Workflow
13
13
 
14
- The queue worker processes build start messages from RabbitMQ and manages pod lifecycle in Kubernetes.
15
-
16
- > **See [WORKFLOW.md](WORKFLOW.md) for detailed workflow diagram with retry behavior**
14
+ The queue worker processes build start messages from RabbitMQ and manages pod lifecycle in Kubernetes with **smart retry logic** and **progressive backoff**.
17
15
 
16
+ > **See [WORKFLOW.md](WORKFLOW.md) for detailed workflow diagram with retry behavior and queue configuration**
18
17
  ### Configuration
19
18
 
20
19
  - `prefetchCount`: 20 messages per worker (default)
21
- - `buildInitTimeout`: 5 minutes (default)
22
- - `messageReprocessLimit`: 5 retries in retry queue (default)
20
+ - `initTimeout`: 5 minutes (default)
21
+ - `messageReprocessLimit`: 6 retries in retry queue (default)
23
22
 
24
23
  ## Testing
25
24
 
package/WORKFLOW.md CHANGED
@@ -1,166 +1,350 @@
1
- # Build Start Workflow - Detailed Flow
1
+ # Build Messages Processing Workflow
2
2
 
3
- ## Main Queue Processing
3
+ ## Overview
4
+
5
+ This document describes the **queue-based retry mechanism** for build pod initialization with **progressive backoff** and **smart status distinction**. The system uses RabbitMQ's native message TTL and dead-letter exchange features with per-message TTL for variable delays
6
+ to simulate delayed-queue behavior for message verification.
7
+
8
+ ### Key Features
9
+
10
+ - **Status Code Distinction**: Separates pod scheduling issues (`waiting`) from image pull delays (`initializing`)
11
+ - **Progressive Backoff**: Increasing retry delays for large image downloads (30s → 80s)
12
+ - **Timeout Tracking**: Only pod scheduling delays count against the 3-minute SLO
13
+ - **Per-Message TTL**: Allows different retry delays for different scenarios
14
+ - **Two-Queue Pattern**: Wait queue (`sdRetryQueue-wait`) with TTL → Ready queue (`sdRetryQueue`)
15
+
16
+ ## Architecture
4
17
 
5
18
  ```
6
19
  ┌─────────────────────────────────────────────────────────────────────────────┐
7
- MAIN QUEUE: Message Processing
20
+ │ QUEUE TOPOLOGY
21
+ └─────────────────────────────────────────────────────────────────────────────┘
22
+
23
+ queue-service (Redis/Resque)
24
+
25
+
26
+ ┌───────────────────────────────────────────────────────┐
27
+ │ RabbitMQ Exchange: "build" (topic) │
28
+ └───────────────────────────────────────────────────────┘
29
+
30
+ ├─────────────────┬──────────────────┬──────────────────┬──────────────────┐
31
+ │ │ │ │ │
32
+ ▼ ▼ ▼ ▼ ▼
33
+ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
34
+ │ sd │ │ sdRetry │ │sdRetry-wait │ │ sddlr │ │ default │
35
+ │ (main queue) │ │ (ready queue)│ │ (wait queue) │ │ (delay/retry)│ │ (catch-all) │
36
+ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘
37
+ │ │ │ │
38
+ │ start/stop │ verify │ per-msg TTL │ delay 5s
39
+ │ TTL: 8hr │ NO queue TTL │ 30s-80s │ then → sd
40
+ │ DLX → sddlr  │ (consumers) │ DLX → sdretry │
41
+ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘
42
+
43
+ │ (after per-msg TTL expires)
44
+ └────────► sdretry
45
+ ```
46
+
47
+ ## Main Queue Processing (sd)
48
+
49
+ ```
50
+ ┌─────────────────────────────────────────────────────────────────────────────┐
51
+ │ MAIN QUEUE: Start/Stop Job Processing │
8
52
  └─────────────────────────────────────────────────────────────────────────────┘
9
53
 
10
54
  ┌──────────────────┐
11
55
  │ Receive Message │
56
+ │ from sd │
12
57
  │ (prefetch=20) │
13
58
  └────────┬─────────┘
14
59
 
15
- ├─────────────────────────────────────────────────────────┐
16
- │ │
17
- ┌────────▼─────────┐ ┌─────────▼─────────┐
18
- Start Timeout │ Spawn Thread │
19
- (5 min timer) │ │ Call _start()
20
- └──────────────────┘ └─────────┬─────────┘
21
- │ │
22
- │ ┌──────────▼──────────┐
23
- │ │ Try Create K8s Pod │
24
- │ │ (POST to K8s API) │
25
- │ └──────────┬──────────┘
26
- │ │
27
- │ ┌────────────────────┼───────────────┐
28
- │ │ │ │
29
- │ ┌──────────▼─────────┐ ┌──────▼─────────────────────┐
30
- │ │ Success (201) │ │ API Error (500/503/etc) │
31
- │ │ Pod Created! │ │ Network error, K8s down │
32
- │ └──────────┬─────────┘ └──────┬─────────────────────┘
33
- │ │ │
34
- │ ┌──────────▼──────────────┐ ┌─▼──────────────────────┐
35
- │ │ Check Pod Status │ │ THROW EXCEPTION │
36
- │ │ (GET pod/status) │ │ "Failed to create pod" │
37
- │ └──────────┬──────────────┘ └─┬──────────────────────┘
38
- │ │ │
39
- │ ┌─────────────┼─────────────┐ │ .on('error')
40
- │ │ │ │ │
41
- │ ┌──────────▼─────┐ ┌───▼───┐ ┌─────▼──────┐▼──────────────────┐
42
- │ │ Pod Status: │ │ Pod: │ │ Pod Status:││ Retry < 5? │
43
- │ │ pending/running│ │failed │ │ unknown ││ YES: NACK (retry)│
44
- │ └──────────┬─────┘ └───┬───┘ └─────┬──────┘│ NO: FAILURE+ACK │
45
- │ │ │ │ └──────────────────┘
46
- │ ┌──────────▼─────┐ ┌───▼─────────────▼───┐
47
- │ │ Return TRUE │ │ Return FALSE │
48
- │ │ "Pod OK" │ │ "Status check failed"│
49
- │ └──────────┬─────┘ └───┬──────────────────┘
50
- │ │ │
51
- │ ┌──────────▼─────┐ ┌───▼──────────────┐
52
- │ │ ACK message │ │ Clear timeout │
53
- │ │ (free prefetch)│ │ ACK message │
54
- │ └──────────┬─────┘ │ Push to RETRY │
55
- │ │ │ QUEUE (verify) │
56
- │ ┌──────────▼─────┐ └───┬──────────────┘
57
- │ │ DON'T clear │ │
58
- │ │ timeout! │ │
59
- │ │ (keep monitor) │ │
60
- │ └──────────┬─────┘ │
61
- │ │ │
62
- │◄─────────────────────┘ │
63
- │ │
64
- ┌────────▼─────────┐ │
65
- │ Wait 5 minutes │ │
66
- └────────┬─────────┘ │
67
- │ │
68
- ┌────────▼───────────────────────┐ │
69
- │ Timeout Fires! │ │
70
- │ Update build statusmessage: │ │
71
- │ "Build initialization delayed" │ │
72
- └────────┬───────────────────────┘ │
73
- │ │
74
- ┌────────▼─────────┐ │
75
- │ Push to │◄───────────────────────┘
76
- │ RETRY QUEUE │
60
+
61
+ ┌──────────────────┐
62
+ │ Parse Message │
63
+ jobType: start
64
+ stop
65
+ │ clear │
77
66
  └────────┬─────────┘
78
67
 
68
+ ├──────────────────────────────────────┐
69
+ │ │
70
+ ┌────────▼─────────┐ ┌────────▼─────────┐
71
+ │ jobType=start │ │ jobType=stop │
72
+ │ │ │ jobType=clear │
73
+ └────────┬─────────┘ └────────┬─────────┘
74
+ │ │
75
+ ┌────────▼─────────┐ ┌────────▼─────────┐
76
+ │ Spawn Thread │ │ Spawn Thread │
77
+ │ Call _start() │ │ Execute job │
78
+ └────────┬─────────┘ └────────┬─────────┘
79
+ │ │
80
+ ┌────────▼────────────────┐ │
81
+ │ Create K8s Pod │ │
82
+ │ (POST to K8s API) │ │
83
+ └────────┬────────────────┘ │
84
+ │ │
85
+ ┌────────┼──────────────────┐ │
86
+ │ │ │ │
87
+ ▼ ▼ ▼ │
88
+ ┌─────────────┐ ┌──────────────────┐ │
89
+ │ Success │ │ K8s API Error │ │
90
+ │ (201) │ │ Network timeout │ │
91
+ └─────┬───────┘ └──────────┬───────┘ │
92
+ │ │ │
93
+ │ ▼ │
94
+ │ ┌────────────────────┐ │
95
+ │ │ .on('error') │ │
96
+ │ │ retryCount < 3? │ │
97
+ │ │ YES: NACK (retry) │ │
98
+ │ │ NO: FAILURE + ACK │ │
99
+ │ └────────────────────┘ │
100
+ │ │
101
+ ▼ │
102
+ ┌──────────────────────────────┐ │
103
+ │ Pod created successfully │ │
104
+ │ .on('message') │ │
105
+ └──────────┬───────────────────┘ │
106
+ │ │
107
+ ▼ │
108
+ ┌──────────────────────────────┐ │
109
+ │ ACK message immediately │◄──────────────────┘
110
+ │ (free up prefetch slot) │
111
+ └──────────┬───────────────────┘
112
+
113
+
114
+ ┌─────────────────────────────────────────────────────────────┐
115
+ │ Push to sdretry-wait for verification │
116
+ │ - Add header: x-build-start-time = Date.now() │
117
+ │ - Add header: x-retry-count = 0 │
118
+ │ - Set per-message TTL: 30 seconds (expiration property) │
119
+ │ - Publishes to: sdretry-wait (not sdretry directly!) │
120
+ └──────────┬──────────────────────────────────────────────────┘
121
+
122
+
123
+
124
+ ┌─────────────────────────────────────────────────────────────┐
125
+ │ WAIT QUEUE: sdretry-wait (waits for TTL to expire) │
126
+ │ - Message sits here for TTL duration (30s default) │
127
+ │ - When TTL expires → Dead-letter to sdretry │
128
+ └─────────────────────────────────────────────────────────────┘
129
+
130
+ │ (after TTL expires)
131
+
132
+ ┌─────────────────────────────────────────────────────────────┐
133
+ │ RETRY QUEUE: sdretry (ready for consumption) │
134
+ │ - Consumer picks up message for pod verification │
135
+ └─────────────────────────────────────────────────────────────┘
136
+ ```
137
+
138
+ ## Retry Queue Processing (sdretry)
139
+
140
+ ```
141
+ ┌─────────────────────────────────────────────────────────────────────────────┐
142
+ │ RETRY QUEUE: Pod Verification & Status Check │
143
+ └─────────────────────────────────────────────────────────────────────────────┘
144
+
145
+ ┌──────────────────────────────────┐
146
+ │ Consumer picks up message │
147
+ │ from sdretry │
148
+ │ Headers: x-build-start-time │
149
+ │ x-retry-count │
150
+ └──────────┬───────────────────────┘
151
+
152
+
153
+ ┌──────────────────────────────────┐
154
+ │ Check retry count │
155
+ │ retryCount = x-retry-count || 0 │
156
+ │ if retryCount >= 6: FAIL │
157
+ └──────────┬───────────────────────┘
158
+
159
+
160
+ ┌──────────────────────────────────┐
161
+ │ Spawn Thread │
162
+ │ Call _verify() │
163
+ └──────────┬───────────────────────┘
164
+
165
+ ┌──────────▼──────────────┐
166
+ │ Get Pod Status │
167
+ │ (GET pods?labelSelector)│
168
+ └──────────┬──────────────┘
169
+
170
+ ┌──────────┼────────────────────────────────┐
171
+ │ │ │
172
+ ▼ ▼ ▼
173
+ ┌─────────────────────┐ ┌──────────────────────────┐
174
+ │ Status: 'waiting' │ │ Status: 'initializing' │
175
+ │ (pod not scheduled) │ │ (pod pulling image) │
176
+ └──────────┬──────────┘ └──────────┬───────────────┘
177
+ │ │
178
+ ▼ ▼
179
+ ┌────────────────────────────────────────────────────┐
180
+ │ Check Init Timeout │
181
+ │ ONLY for 'waiting' │
182
+ │ elapsed = now - x-build-start-time │
183
+ │ if elapsed >= 3min: TIMEOUT │
184
+ └────────────┬───────────────────────────────────────┘
79
185
 
80
- ┌────────────▼─────────────────────────────────────────────────────────────────┐
81
- RETRY QUEUE: Pod Verification
82
- └──────────────────────────────────────────────────────────────────────────────┘
83
-
84
- ┌────────────────────┐
85
- Receive Message
86
- │ from Retry Queue │
87
- └─────────┬──────────┘
88
-
89
- ┌─────────▼──────────┐
90
- Spawn Thread
91
- Call _verify()
92
- └─────────┬──────────┘
93
-
94
- ┌─────────▼────────────────┐
95
- │ Try Get Pod Status │
96
- (GET pods?labelSelector) │
97
- └─────────┬────────────────┘
98
-
99
- ┌─────────┼────────────────────────────┐
100
-
101
- ┌───▼─────────────┐ ┌─────────▼────────────────┐
102
- Success │ │ API Error (K8s API down)
103
- │ Got pod status │ │ Network issue │
104
- └───┬─────────────┘ └─────────┬────────────────┘
105
- │ │
106
- │ ┌─────────▼────────────────┐
107
- THROW EXCEPTION
108
- .on('error')
109
- └─────────┬────────────────┘
110
- │ │
111
- │ ┌─────────▼────────────────┐
112
- │ │ Retry < 5? │
113
- │ │ YES: NACK (retry verify)
114
- │ │ NO: FAILURE + ACK │
115
- │ └──────────────────────────┘
116
-
117
-
118
- ┌─────────────────────────────────────────────────────────────────┐
119
- │ Check Pod Status & Container Waiting Reason │
120
- └─────────┬────────────────────────────────────────────────────────┘
121
-
122
- ┌─────┴──────────┬────────────────┬───────────────┬─────────────────┐
123
- │ │ │ │ │
124
- ┌───▼────────────┐ ┌▼──────────┐ ┌─▼────────────┐ ┌▼───────────────┐ ┌▼──────────────┐
125
- │ Pod Status: │ │ Pod: │ │ Pod: │ Pod: │ │ Pod: │
126
- │ running/ │ │ failed/ │ │ pending + │ │ pending + │ │ pending + │
127
- succeeded │ unknown │ │ ErrImagePull │ │ CrashLoopBack │ │ PodInitializing│
128
- └───┬────────────┘ └┬──────────┘ └─┬────────────┘ └┬───────────────┘ └┬──────────────┘
129
- │ │ │ │ │
130
- ┌───▼────────────┐ ┌▼────────────────────────────────▼──────────────────▼──────────────┐
131
- Return EMPTY │ Return ERROR MESSAGE │
132
- (success) │ │ "Build failed to start..."
133
- └───┬────────────┘ └┬───────────────────────────────────────────────────────────────────┘
134
-
135
- ┌───▼────────────┐ ┌▼────────────────┐ ┌─────────▼──────────┐
136
- │ ACK message │ │ Update build to │ Return EMPTY │
137
- (build OK) │ │ FAILURE │ (allow more time │
138
- └────────────────┘ ACK message │ for image pull) │
139
- └─────────────────┘ └─────────┬──────────┘
140
-
141
- ┌─────────▼──────────┐
142
- ACK message │
143
- (pod still healthy
144
- may take 10+ min)
145
- └────────────────────┘
186
+ ┌────────┼─────────┐
187
+
188
+ ▼ ▼ ▼
189
+ ┌─────────────┐ ┌──────────────┐
190
+ │ Timeout! │ │ Within time │
191
+ elapsed>=3m │ elapsed<3m │
192
+ └─────┬───────┘ └──────┬───────┘
193
+ │ │
194
+ ▼ ▼
195
+ ┌──────────────────┐ ┌────────────────────────────────┐
196
+ FAIL BUILD │ Retry with appropriate delay │
197
+ "Pod scheduling │ │
198
+ │ timeout exceeded"│ │ 'waiting': Fixed 30s delay │
199
+ ACK + Stop │ │ 'initializing': Progressive │
200
+ └──────────────────┘ │ 30s + (retryCount × 10s) │
201
+ └─────────┬──────────────────────┘
202
+
203
+
204
+ ┌───────────────────────────────┐
205
+ │ ACK current message │
206
+ Publish to sdretry-wait
207
+ │ with new TTL (expiration) │
208
+ and x-retry-count += 1
209
+ └───────────┬───────────────────┘
210
+
211
+
212
+ ┌───────────────────────────────┐
213
+ Message waits in sdretry-wait
214
+ for TTL duration
215
+ Then dead-letter → sdretry │
216
+ └───────────────────────────────┘
217
+
218
+ Other status codes:
219
+ '' (empty string) → ACK (success, pod running)
220
+ Error message → ACK + Update build → FAILURE
221
+ ```
222
+
223
+ ## Pod Status Decision Tree
224
+
225
+ ```
226
+ ┌─────────────────────────────────────────────────────────────────────────────┐
227
+ POD VERIFICATION LOGIC (_verify in executor-k8s/index.js) │
228
+ └─────────────────────────────────────────────────────────────────────────────┘
229
+
230
+ Check Pod Status
231
+
232
+ ┌────┴──────────────────────────────────────────────────┐
233
+
234
+ ▼ ▼
235
+ Container Waiting Reason? Pod Phase?
236
+ │ │
237
+ ├─ ErrImagePull ──────────┐
238
+ ├─ ImagePullBackOff ───────┼────► FAIL FAST
239
+ ├─ InvalidImageName ────────┘ "Check your image" │
240
+
241
+ ├─ CrashLoopBackOff ───────┐ │
242
+ ├─ CreateContainerError ────┼────► FAIL FAST
243
+ ├─ StartError ──────────────┘ "Contact admin"
244
+
245
+ └─ (none/other) ────────────────────────────────────────┼──► Check phase
246
+
247
+ ├─ Running ──────► SUCCESS ('')
248
+ ├─ Succeeded ────► SUCCESS ('')
249
+ ├─ Failed ───────► FAILURE (error msg)
250
+ ├─ Unknown ──────► FAILURE (error msg)
251
+
252
+ └─ Pending ──┐
253
+
254
+ ┌───────────────▼──────────────┐
255
+ │ Has nodeName assigned? │
256
+ └───────────────┬──────────────┘
257
+
258
+ ┌────────────────────┼────────────────────┐
259
+ │ │ │
260
+ ▼ ▼ ▼
261
+ ┌───────────────┐ ┌─────────────────┐ ┌──────────────┐
262
+ │ nodeName: NO │ │ nodeName: YES │ │ Other cases │
263
+ │ (not sched) │ │ (initializing) │ │ │
264
+ └───────┬───────┘ └────────┬────────┘ └──────┬───────┘
265
+ │ │ │
266
+ ▼ ▼ ▼
267
+ ┌───────────────┐ ┌─────────────────┐ ┌──────────────┐
268
+ │ Return │ │ Return │ │ Fail or │
269
+ │ 'waiting' │ │ 'initializing' │ │ other status │
270
+ │ │ │ │ │ │
271
+ │ (pod waiting │ │ (pod pulling │ └──────────────┘
272
+ │ to schedule) │ │ image) │
273
+ └───────────────┘ └─────────────────┘
274
+
275
+ Status Code Meanings:
276
+ - '' (empty string) → Pod is running successfully
277
+ - 'waiting' → Pod not scheduled (counts against 3min timeout)
278
+ - 'initializing' → Pod pulling image (progressive backoff, no timeout)
279
+ - Error message string → Immediate failure (ImagePullBackOff, CrashLoopBackOff, etc.)
280
+ ```
281
+
282
+ ## Queue Configuration
283
+
284
+ ### RabbitMQ Queue Definitions
285
+
286
+ **sdQueue** (main queue for consumers):
287
+ ```json
288
+ {
289
+ "name": "sdQueue",
290
+ "vhost": "screwdriver",
291
+ "durable": true,
292
+ "auto_delete": false,
293
+ "arguments": {
294
+ "x-dead-letter-exchange": "build",
295
+ "x-dead-letter-routing-key": "sdQueuedlr",
296
+ "x-max-priority": 3,
297
+ "x-message-ttl": 28800000
298
+ }
299
+ }
300
+ ```
301
+ **sdQueuedlr** (DLR queue for consumers, for messages that fail to be ACK'd):
302
+ ```json
303
+ {
304
+ "name": "sdQueuedlr",
305
+ "vhost": "screwdriver",
306
+ "durable": true,
307
+ "auto_delete": false,
308
+ "arguments": {
309
+ "x-dead-letter-exchange": "build",
310
+ "x-dead-letter-routing-key": "sdQueue",
311
+ "x-max-priority": 3,
312
+ "x-message-ttl": 5000,
313
+ "x-queue-mode": "lazy"
314
+ }
315
+ }
316
+ ```
317
+
318
+ **sdRetryQueue** (ready queue for consumers):
319
+ ```json
320
+ {
321
+ "name": "sdRetryQueue",
322
+ "vhost": "screwdriver",
323
+ "durable": true,
324
+ "auto_delete": false,
325
+ "arguments": {
326
+ "x-max-priority": 3,
327
+ "x-queue-type": "classic"
328
+ }
329
+ }
330
+ ```
331
+
332
+ **IMPORTANT**: `sdRetryQueue` must NOT have `x-message-ttl` to allow per-message TTL!
333
+
334
+ **sdRetryQueue-wait** (wait queue with dead-letter routing):
335
+ ```json
336
+ {
337
+ "name": "sdretry-wait",
338
+ "vhost": "screwdriver",
339
+ "durable": true,
340
+ "auto_delete": false,
341
+ "arguments": {
342
+ "x-dead-letter-exchange": "build",
343
+ "x-dead-letter-routing-key": "sdretry",
344
+ "x-max-priority": 3,
345
+ "x-queue-type": "classic"
346
+ }
347
+ }
146
348
  ```
147
349
 
148
- ## Key Points
149
-
150
- ### Main Queue Retries (NACK):
151
- - **When**: Pod creation throws exception (K8s API error, network issue)
152
- - **Why**: Pod was never created, safe to retry
153
- - **How many**: Up to 5 times via RabbitMQ requeue
154
- - **After max retries**: Update build to FAILURE and ACK
155
-
156
- ### Retry Queue Retries (NACK):
157
- - **When**: _verify() throws exception (can't get pod status from K8s)
158
- - **Why**: Transient API issue, pod might be fine
159
- - **How many**: Up to 5 times via RabbitMQ requeue
160
- - **After max retries**: Update build to FAILURE and ACK
161
-
162
- ### No Retries (ACK immediately):
163
- - Pod created successfully (pending/running status) → main queue
164
- - Pod status check failed (pod exists but failed/unknown) → main queue → retry queue
165
- - Verify detects failed pod (returns error message) → retry queue
166
- - Verify detects healthy pod (returns empty) → retry queue
350
+ # TODO: Use Delayed queue plugin https://github.com/rabbitmq/rabbitmq-delayed-message-exchange
@@ -359,12 +359,16 @@ rabbitmq:
359
359
  messageReprocessLimit: RABBITMQ_MSG_REPROCESS_LIMIT
360
360
  # Queue name of the retry queue
361
361
  retryQueue: RABBITMQ_RETRYQUEUE
362
+ # Queue name of the delayed retry queue
363
+ retryDelayedQueue: RABBITMQ_RETRYDELAYEDQUEUE
362
364
  # retry queue enable/disable flag
363
365
  retryQueueEnabled: RABBITMQ_RETRYQUEUE_ENABLED
364
366
  # Exchange / router name for rabbitmq
365
367
  exchange: RABBITMQ_EXCHANGE
366
368
  # build pod initialization timeout
367
369
  initTimeout: RABBITMQ_BUILD_INIT_TIMEOUT
370
+ # delay between retries in seconds
371
+ retryDelay: RABBITMQ_RETRY_DELAY
368
372
  httpd:
369
373
  # Port to listen on
370
374
  port: PORT
@@ -240,15 +240,19 @@ rabbitmq:
240
240
  # Prefetch count
241
241
  prefetchCount: "20"
242
242
  # Message reprocess limit - max retry for a message
243
- messageReprocessLimit: "3"
243
+ messageReprocessLimit: "6" # short wait but more retries
244
244
  # Queue name of the retry queue
245
245
  retryQueue: sdRetryQueue
246
+ # Queue name of the delayed retry queue
247
+ retryDelayedQueue: sdRetryQueue-wait
246
248
  # retry queue enable/disable flag
247
249
  retryQueueEnabled: false
248
250
  # Exchange / router name for rabbitmq
249
251
  exchange: build
250
252
  # build pod initialization timeout in minutes
251
253
  initTimeout: "5"
254
+ # delay between retries in seconds
255
+ retryDelay: "30"
252
256
  httpd:
253
257
  # Port to listen on
254
258
  port: 80
package/lib/config.js CHANGED
@@ -18,9 +18,11 @@ const {
18
18
  prefetchCount,
19
19
  messageReprocessLimit,
20
20
  retryQueue,
21
+ retryDelayedQueue,
21
22
  retryQueueEnabled,
22
23
  exchange,
23
- initTimeout
24
+ initTimeout,
25
+ retryDelay
24
26
  } = rabbitmqConfig;
25
27
  const amqpURI = `${protocol}://${username}:${password}@${host}:${port}${vhost}`;
26
28
 
@@ -60,7 +62,9 @@ function getConfig() {
60
62
  retryQueue,
61
63
  retryQueueEnabled: convertToBool(retryQueueEnabled),
62
64
  exchange,
63
- initTimeout: Number(initTimeout) || 5
65
+ initTimeout: Number(initTimeout) || 5,
66
+ retryDelay: Number(retryDelay) || 30,
67
+ retryDelayedQueue
64
68
  };
65
69
  }
66
70
 
@@ -3,7 +3,7 @@
3
3
  const amqp = require('amqp-connection-manager');
4
4
  const logger = require('screwdriver-logger');
5
5
  const config = require('./config');
6
- const { amqpURI, connectOptions, retryQueue, exchange, retryQueueEnabled } = config.getConfig();
6
+ const { amqpURI, connectOptions, exchange, retryQueueEnabled, retryDelayedQueue } = config.getConfig();
7
7
 
8
8
  let retryQueueConn;
9
9
 
@@ -29,12 +29,16 @@ function getRetryQueueConn() {
29
29
  }
30
30
 
31
31
  /**
32
- * Pushes a message to the retry queue
32
+ * Pushes a message to the retry wait queue (delay queue)
33
+ * Messages will sit in the wait queue for the specified delay before being routed to the ready queue
33
34
  * @param {message} buildConfig build config
34
35
  * @param {messageId} messageId id of the message queue
36
+ * @param {number} delayMs delay in milliseconds (supports dynamic delays for progressive backoff)
37
+ * @param {number} retryCount current retry count (optional, defaults to 0)
38
+ * @param {number} buildStartTime timestamp when build verification started (optional)
35
39
  * @returns {Promise} resolves to null or error
36
40
  */
37
- async function push(buildConfig, messageId) {
41
+ async function push(buildConfig, messageId, delayMs = 30000, retryCount = 0, buildStartTime = null) {
38
42
  if (!retryQueueEnabled) {
39
43
  return Promise.resolve();
40
44
  }
@@ -49,20 +53,37 @@ async function push(buildConfig, messageId) {
49
53
  setup: channel => channel.checkExchange(exchange)
50
54
  });
51
55
 
52
- logger.info('publishing msg to retry queue: %s', messageId);
56
+ // Publish to the WAIT queue, not the ready queue with per-message TTL
57
+ const waitQueue = retryDelayedQueue;
58
+ const delaySec = (delayMs / 1000).toFixed(0);
59
+
60
+ logger.info('publishing msg to retry wait queue: %s (will delay %ss)', messageId, delaySec);
61
+
62
+ // Add headers for timeout tracking and retry count
63
+ const headers = {
64
+ 'x-build-start-time': buildStartTime || Date.now(),
65
+ 'x-retry-count': retryCount
66
+ };
53
67
 
54
68
  return channelWrapper
55
- .publish(exchange, retryQueue, message, {
69
+ .publish(exchange, waitQueue, message, {
56
70
  contentType: 'application/json',
57
- persistent: true
71
+ persistent: true,
72
+ headers,
73
+ expiration: String(delayMs)
58
74
  })
59
75
  .then(() => {
60
- logger.info('successfully publishing msg id %s -> queue %s', messageId, retryQueue);
76
+ logger.info(
77
+ 'successfully published msg id %s -> wait queue %s (delay: %ss)',
78
+ messageId,
79
+ waitQueue,
80
+ delaySec
81
+ );
61
82
 
62
83
  return channelWrapper.close();
63
84
  })
64
85
  .catch(err => {
65
- logger.error('publishing failed to retry queue: %s', err.message);
86
+ logger.error('publishing failed to retry wait queue: %s', err.message);
66
87
  channelWrapper.close();
67
88
 
68
89
  throw err;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "screwdriver-buildcluster-queue-worker",
3
- "version": "5.2.0",
3
+ "version": "6.0.0",
4
4
  "description": "An amqp connection manager implementation that consumes jobs from Rabbitmq queue.",
5
5
  "main": "index.js",
6
6
  "scripts": {
package/receiver.js CHANGED
@@ -17,12 +17,13 @@ const {
17
17
  cachePath,
18
18
  retryQueue,
19
19
  retryQueueEnabled,
20
- initTimeout
20
+ exchange,
21
+ initTimeout,
22
+ retryDelay
21
23
  } = config.getConfig();
22
24
  const { spawn } = threads;
23
25
  const CACHE_STRATEGY_DISK = 'disk';
24
26
  let channelWrapper;
25
- const INIT_TIMEOUT = initTimeout * 60 * 1000; // milliseconds
26
27
 
27
28
  /**
28
29
  * onMessage consume messages in batches, once its available in the queue. channelWrapper has in-built back pressure
@@ -105,68 +106,22 @@ const onMessage = data => {
105
106
  }
106
107
  }
107
108
 
108
- let timeoutWarningLogged = false;
109
- let timeoutTimer = null;
110
-
111
- if (jobType === 'start') {
112
- timeoutTimer = setTimeout(async () => {
113
- if (!timeoutWarningLogged) {
114
- timeoutWarningLogged = true;
115
- const timeoutMessage = `Build initialization timeout exceeded (${initTimeout}min) for ${job}`;
116
-
117
- logger.error(timeoutMessage);
118
-
119
- // Update build statusmessage only to show delayed initialization
120
- try {
121
- await helper.updateBuildStatusAsync(
122
- buildConfig,
123
- undefined,
124
- 'Build initialization delayed - pod creation taking longer than expected'
125
- );
126
- logger.info(`Build status updated with delay warning for build ${buildId}`);
127
- } catch (err) {
128
- logger.error(
129
- `Failed to update build status with delay warning for build:${buildId}:${err}`
130
- );
131
- }
132
-
133
- // Push to retry queue for verification and potential failure
134
- // This allows verify to check pod status and fail if still pending
135
- logger.info(`Pushing ${job} to retry queue for verification after timeout`);
136
- retryQueueLib.push(buildConfig, buildId);
137
- }
138
- }, INIT_TIMEOUT);
139
- }
140
-
141
109
  thread
142
110
  .send([jobType, buildConfig, job])
143
111
  .on('message', successful => {
144
112
  logger.info(`acknowledge, job completed for ${job}, result: ${successful}`);
145
113
 
146
- if (!successful && jobType === 'start') {
147
- // Pod failed immediately (status check returned false)
148
- // Clear timeout and push to retry queue for immediate verification
149
- if (timeoutTimer) {
150
- clearTimeout(timeoutTimer);
151
- }
152
- retryQueueLib.push(buildConfig, buildId);
153
- } else if (successful && jobType === 'start') {
154
- // Pod created successfully - DON'T clear timeout
155
- // Let the timeout fire to verify pod eventually started
156
- // This handles pods that get stuck in pending after creation
157
- logger.info(`Timeout remains active for ${job}, will verify after ${initTimeout}min`);
158
- } else if (timeoutTimer) {
159
- // For non-start jobs (stop, verify), or other cases, clear timeout normally
160
- clearTimeout(timeoutTimer);
114
+ if (jobType === 'start') {
115
+ logger.info(`Pushing ${job} to retry queue for verification`);
116
+ retryQueueLib.push(buildConfig, buildId).catch(err => {
117
+ logger.error(`Failed to push to retry queue for ${job}: ${err.message}`);
118
+ });
161
119
  }
162
120
 
163
121
  channelWrapper.ack(data);
164
122
  thread.kill();
165
123
  })
166
124
  .on('error', async error => {
167
- if (timeoutTimer) {
168
- clearTimeout(timeoutTimer);
169
- }
170
125
  thread.kill();
171
126
  if (['403', '404'].includes(error.message.substring(0, 3))) {
172
127
  channelWrapper.ack(data);
@@ -220,25 +175,155 @@ const onRetryMessage = async data => {
220
175
 
221
176
  logger.info(`processing ${job}`);
222
177
 
178
+ const buildStartTime =
179
+ data.properties.headers && data.properties.headers['x-build-start-time']
180
+ ? data.properties.headers['x-build-start-time']
181
+ : null;
182
+ const initTimeoutMs = initTimeout * 60 * 1000;
183
+
223
184
  if (typeof data.properties.headers !== 'undefined') {
224
185
  if (Object.keys(data.properties.headers).length > 0) {
225
- retryCount = data.properties.headers['x-death'][0].count;
226
- logger.info(`retrying ${retryCount}(${messageReprocessLimit}) for ${job}`);
186
+ if (data.properties.headers['x-retry-count']) {
187
+ retryCount = data.properties.headers['x-retry-count'];
188
+ logger.info(`retrying ${retryCount}(${messageReprocessLimit}) for ${job}`);
189
+ } else if (data.properties.headers['x-death']) {
190
+ retryCount = data.properties.headers['x-death'][0].count;
191
+ logger.info(`retrying ${retryCount}(${messageReprocessLimit}) for ${job}`);
192
+ }
227
193
  }
228
194
  }
195
+
229
196
  thread
230
197
  .send([jobType, buildConfig, job])
231
198
  .on('message', async message => {
232
199
  logger.info(`acknowledge, job completed for ${job}, result: ${message}`);
233
- if (message) {
200
+
201
+ if (message === 'waiting') {
202
+ // Pod not scheduled - check timeout
203
+ if (buildStartTime) {
204
+ const elapsedMs = Date.now() - buildStartTime;
205
+ const elapsedMinutes = (elapsedMs / 1000 / 60).toFixed(2);
206
+
207
+ logger.info(
208
+ `Build ${buildId} pod not scheduled yet, elapsed: ${elapsedMinutes}min, timeout: ${initTimeout}min`
209
+ );
210
+
211
+ if (elapsedMs >= initTimeoutMs) {
212
+ // Timeout exceeded - fail immediately
213
+ logger.error(
214
+ `Build ${buildId} pod scheduling timeout exceeded: ${elapsedMinutes}min > ${initTimeout}min`
215
+ );
216
+
217
+ // metric for alerting
218
+ logger.error(
219
+ `[BUILD_SCHEDULING_FAILURE] buildId=${buildId} elapsed_minutes=${elapsedMinutes} ` +
220
+ `timeout_minutes=${initTimeout} retry_count=${retryCount}`
221
+ );
222
+
223
+ try {
224
+ await helper.updateBuildStatusAsync(
225
+ buildConfig,
226
+ 'FAILURE',
227
+ `Build failed to start within ${initTimeout} minutes (elapsed: ${elapsedMinutes} minutes). Pod was not scheduled - cluster may be out of capacity.`
228
+ );
229
+ logger.info(`Build ${buildId} marked as FAILURE due to pod scheduling timeout`);
230
+ } catch (err) {
231
+ logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
232
+ }
233
+ channelWrapper.ack(data);
234
+ thread.kill();
235
+
236
+ return;
237
+ }
238
+ }
239
+
240
+ // Timeout not exceeded - retry with delay
241
+ if (retryCount >= messageReprocessLimit) {
242
+ logger.error(
243
+ `Build ${buildId} max retries (${messageReprocessLimit}) exceeded while waiting for pod scheduling`
244
+ );
245
+
246
+ // metric for alerting
247
+ logger.error(
248
+ `[BUILD_SCHEDULING_FAILURE] buildId=${buildId} elapsed_minutes=` +
249
+ `${((Date.now() - buildStartTime) / 1000 / 60).toFixed(2)} max_retries=${retryCount}`
250
+ );
251
+
252
+ try {
253
+ await helper.updateBuildStatusAsync(
254
+ buildConfig,
255
+ 'FAILURE',
256
+ 'Build failed to start. Pod was not scheduled after maximum retries - cluster may be out of capacity.'
257
+ );
258
+ logger.info(`Build ${buildId} marked as FAILURE due to max retries`);
259
+ } catch (err) {
260
+ logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
261
+ }
262
+ channelWrapper.ack(data);
263
+ } else {
264
+ const nextRetryCount = retryCount + 1;
265
+
266
+ logger.info(
267
+ `Build ${buildId} pod not scheduled, retrying ${nextRetryCount}/${messageReprocessLimit} in ${retryDelay}s`
268
+ );
269
+ channelWrapper.ack(data);
270
+
271
+ // Re-publish to retry queue with incremented retry count
272
+ retryQueueLib
273
+ .push(buildConfig, buildId, retryDelay * 1000, nextRetryCount, buildStartTime)
274
+ .catch(err => {
275
+ logger.error(`Failed to re-publish to retry queue for ${job}: ${err.message}`);
276
+ });
277
+ }
278
+ } else if (message === 'initializing') {
279
+ // Pod is initializing (pulling image) - use progressive backoff for large images
280
+ if (retryCount >= messageReprocessLimit) {
281
+ logger.error(
282
+ `Build ${buildId} max retries (${messageReprocessLimit}) exceeded while pod initializing/pulling image`
283
+ );
284
+ try {
285
+ await helper.updateBuildStatusAsync(
286
+ buildConfig,
287
+ 'FAILURE',
288
+ 'Build failed to start. Pod initialization timeout - pod may be stuck pulling a large image or container startup is slow.'
289
+ );
290
+ logger.info(`Build ${buildId} marked as FAILURE due to max retries during initialization`);
291
+ } catch (err) {
292
+ logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
293
+ }
294
+ channelWrapper.ack(data);
295
+ } else {
296
+ const nextRetryCount = retryCount + 1;
297
+
298
+ const baseDelayMs = retryDelay * 1000;
299
+ const incrementMs = 10000 * retryCount;
300
+ const delayMs = baseDelayMs + incrementMs;
301
+ const delaySec = (delayMs / 1000).toFixed(0);
302
+
303
+ logger.info(
304
+ `Build ${buildId} pod still initializing/pulling image, retrying ${nextRetryCount}/${messageReprocessLimit} in ${delaySec}s (progressive backoff)`
305
+ );
306
+ channelWrapper.ack(data);
307
+
308
+ // Re-publish to retry queue with incremented retry count and progressive delay
309
+ retryQueueLib.push(buildConfig, buildId, delayMs, nextRetryCount, buildStartTime).catch(err => {
310
+ logger.error(`Failed to re-publish to retry queue for ${job}: ${err.message}`);
311
+ });
312
+ }
313
+ } else if (message && message !== '') {
314
+ // Pod has failed - update build status and ack
234
315
  try {
235
316
  await helper.updateBuildStatusAsync(buildConfig, 'FAILURE', message);
236
317
  logger.info(`build status successfully updated for build ${buildId}`);
237
318
  } catch (err) {
238
319
  logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
239
320
  }
321
+ channelWrapper.ack(data);
322
+ } else {
323
+ // Empty string means pod is running successfully - ack
324
+ logger.info(`pod started successfully for ${job}, acknowledging`);
325
+ channelWrapper.ack(data);
240
326
  }
241
- channelWrapper.ack(data);
242
327
  thread.kill();
243
328
  })
244
329
  .on('error', async error => {
@@ -288,7 +373,11 @@ const listen = async () => {
288
373
  const queueFn = [channel.checkQueue(queue), channel.prefetch(prefetchCount), channel.consume(queue, onMessage)];
289
374
 
290
375
  if (retryQueueEnabled) {
291
- queueFn.push(channel.checkQueue(retryQueue), channel.consume(retryQueue, onRetryMessage));
376
+ queueFn.push(
377
+ channel.checkQueue(retryQueue),
378
+ channel.bindQueue(retryQueue, exchange, retryQueue),
379
+ channel.consume(retryQueue, onRetryMessage)
380
+ );
292
381
  }
293
382
 
294
383
  return Promise.all(queueFn);