screwdriver-buildcluster-queue-worker 5.2.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -5
- package/WORKFLOW.md +334 -150
- package/config/custom-environment-variables.yaml +4 -0
- package/config/default.yaml +5 -1
- package/lib/config.js +6 -2
- package/lib/retry-queue.js +29 -8
- package/package.json +1 -1
- package/receiver.js +147 -58
package/README.md
CHANGED
|
@@ -11,15 +11,14 @@ npm install screwdriver-buildcluster-queue-worker
|
|
|
11
11
|
|
|
12
12
|
## Build Start Workflow
|
|
13
13
|
|
|
14
|
-
The queue worker processes build start messages from RabbitMQ and manages pod lifecycle in Kubernetes
|
|
15
|
-
|
|
16
|
-
> **See [WORKFLOW.md](WORKFLOW.md) for detailed workflow diagram with retry behavior**
|
|
14
|
+
The queue worker processes build start messages from RabbitMQ and manages pod lifecycle in Kubernetes with **smart retry logic** and **progressive backoff**.
|
|
17
15
|
|
|
16
|
+
> **See [WORKFLOW.md](WORKFLOW.md) for detailed workflow diagram with retry behavior and queue configuration**
|
|
18
17
|
### Configuration
|
|
19
18
|
|
|
20
19
|
- `prefetchCount`: 20 messages per worker (default)
|
|
21
|
-
- `
|
|
22
|
-
- `messageReprocessLimit`:
|
|
20
|
+
- `initTimeout`: 5 minutes (default)
|
|
21
|
+
- `messageReprocessLimit`: 6 retries in retry queue (default)
|
|
23
22
|
|
|
24
23
|
## Testing
|
|
25
24
|
|
package/WORKFLOW.md
CHANGED
|
@@ -1,166 +1,350 @@
|
|
|
1
|
-
# Build
|
|
1
|
+
# Build Messages Processing Workflow
|
|
2
2
|
|
|
3
|
-
##
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
This document describes the **queue-based retry mechanism** for build pod initialization with **progressive backoff** and **smart status distinction**. The system uses RabbitMQ's native message TTL and dead-letter exchange features with per-message TTL for variable delays
|
|
6
|
+
to simulate delayed-queue behavior for message verification.
|
|
7
|
+
|
|
8
|
+
### Key Features
|
|
9
|
+
|
|
10
|
+
- **Status Code Distinction**: Separates pod scheduling issues (`waiting`) from image pull delays (`initializing`)
|
|
11
|
+
- **Progressive Backoff**: Increasing retry delays for large image downloads (30s → 80s)
|
|
12
|
+
- **Timeout Tracking**: Only pod scheduling delays count against the `initTimeout` SLO (5 minutes by default; the diagrams below use 3 minutes as an example)
|
|
13
|
+
- **Per-Message TTL**: Allows different retry delays for different scenarios
|
|
14
|
+
- **Two-Queue Pattern**: Wait queue (`sdRetryQueue-wait`) with TTL → Ready queue (`sdRetryQueue`)
|
|
15
|
+
|
|
16
|
+
## Architecture
|
|
4
17
|
|
|
5
18
|
```
|
|
6
19
|
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
7
|
-
│
|
|
20
|
+
│ QUEUE TOPOLOGY │
|
|
21
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
22
|
+
|
|
23
|
+
queue-service (Redis/Resque)
|
|
24
|
+
│
|
|
25
|
+
▼
|
|
26
|
+
┌───────────────────────────────────────────────────────┐
|
|
27
|
+
│ RabbitMQ Exchange: "build" (topic) │
|
|
28
|
+
└───────────────────────────────────────────────────────┘
|
|
29
|
+
│
|
|
30
|
+
├─────────────────┬──────────────────┬──────────────────┬──────────────────┐
|
|
31
|
+
│ │ │ │ │
|
|
32
|
+
▼ ▼ ▼ ▼ ▼
|
|
33
|
+
┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
|
34
|
+
│ sd │ │ sdRetry │ │sdRetry-wait │ │ sddlr │ │ default │
|
|
35
|
+
│ (main queue) │ │ (ready queue)│ │ (wait queue) │ │ (delay/retry)│ │ (catch-all) │
|
|
36
|
+
└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘
|
|
37
|
+
│ │ │ │
|
|
38
|
+
│ start/stop │ verify │ per-msg TTL │ delay 5s
|
|
39
|
+
│ TTL: 8hr │ NO queue TTL │ 30s-80s │ then → sd
|
|
40
|
+
│ DLX → gq1dlr │ (consumers) │ DLX → sdretry │
|
|
41
|
+
└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘
|
|
42
|
+
│
|
|
43
|
+
│ (after per-msg TTL expires)
|
|
44
|
+
└────────► sdretry
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Main Queue Processing (sd)
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
51
|
+
│ MAIN QUEUE: Start/Stop Job Processing │
|
|
8
52
|
└─────────────────────────────────────────────────────────────────────────────┘
|
|
9
53
|
|
|
10
54
|
┌──────────────────┐
|
|
11
55
|
│ Receive Message │
|
|
56
|
+
│ from sd │
|
|
12
57
|
│ (prefetch=20) │
|
|
13
58
|
└────────┬─────────┘
|
|
14
59
|
│
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
│
|
|
19
|
-
│
|
|
20
|
-
|
|
21
|
-
│ │
|
|
22
|
-
│ ┌──────────▼──────────┐
|
|
23
|
-
│ │ Try Create K8s Pod │
|
|
24
|
-
│ │ (POST to K8s API) │
|
|
25
|
-
│ └──────────┬──────────┘
|
|
26
|
-
│ │
|
|
27
|
-
│ ┌────────────────────┼───────────────┐
|
|
28
|
-
│ │ │ │
|
|
29
|
-
│ ┌──────────▼─────────┐ ┌──────▼─────────────────────┐
|
|
30
|
-
│ │ Success (201) │ │ API Error (500/503/etc) │
|
|
31
|
-
│ │ Pod Created! │ │ Network error, K8s down │
|
|
32
|
-
│ └──────────┬─────────┘ └──────┬─────────────────────┘
|
|
33
|
-
│ │ │
|
|
34
|
-
│ ┌──────────▼──────────────┐ ┌─▼──────────────────────┐
|
|
35
|
-
│ │ Check Pod Status │ │ THROW EXCEPTION │
|
|
36
|
-
│ │ (GET pod/status) │ │ "Failed to create pod" │
|
|
37
|
-
│ └──────────┬──────────────┘ └─┬──────────────────────┘
|
|
38
|
-
│ │ │
|
|
39
|
-
│ ┌─────────────┼─────────────┐ │ .on('error')
|
|
40
|
-
│ │ │ │ │
|
|
41
|
-
│ ┌──────────▼─────┐ ┌───▼───┐ ┌─────▼──────┐▼──────────────────┐
|
|
42
|
-
│ │ Pod Status: │ │ Pod: │ │ Pod Status:││ Retry < 5? │
|
|
43
|
-
│ │ pending/running│ │failed │ │ unknown ││ YES: NACK (retry)│
|
|
44
|
-
│ └──────────┬─────┘ └───┬───┘ └─────┬──────┘│ NO: FAILURE+ACK │
|
|
45
|
-
│ │ │ │ └──────────────────┘
|
|
46
|
-
│ ┌──────────▼─────┐ ┌───▼─────────────▼───┐
|
|
47
|
-
│ │ Return TRUE │ │ Return FALSE │
|
|
48
|
-
│ │ "Pod OK" │ │ "Status check failed"│
|
|
49
|
-
│ └──────────┬─────┘ └───┬──────────────────┘
|
|
50
|
-
│ │ │
|
|
51
|
-
│ ┌──────────▼─────┐ ┌───▼──────────────┐
|
|
52
|
-
│ │ ACK message │ │ Clear timeout │
|
|
53
|
-
│ │ (free prefetch)│ │ ACK message │
|
|
54
|
-
│ └──────────┬─────┘ │ Push to RETRY │
|
|
55
|
-
│ │ │ QUEUE (verify) │
|
|
56
|
-
│ ┌──────────▼─────┐ └───┬──────────────┘
|
|
57
|
-
│ │ DON'T clear │ │
|
|
58
|
-
│ │ timeout! │ │
|
|
59
|
-
│ │ (keep monitor) │ │
|
|
60
|
-
│ └──────────┬─────┘ │
|
|
61
|
-
│ │ │
|
|
62
|
-
│◄─────────────────────┘ │
|
|
63
|
-
│ │
|
|
64
|
-
┌────────▼─────────┐ │
|
|
65
|
-
│ Wait 5 minutes │ │
|
|
66
|
-
└────────┬─────────┘ │
|
|
67
|
-
│ │
|
|
68
|
-
┌────────▼───────────────────────┐ │
|
|
69
|
-
│ Timeout Fires! │ │
|
|
70
|
-
│ Update build statusmessage: │ │
|
|
71
|
-
│ "Build initialization delayed" │ │
|
|
72
|
-
└────────┬───────────────────────┘ │
|
|
73
|
-
│ │
|
|
74
|
-
┌────────▼─────────┐ │
|
|
75
|
-
│ Push to │◄───────────────────────┘
|
|
76
|
-
│ RETRY QUEUE │
|
|
60
|
+
▼
|
|
61
|
+
┌──────────────────┐
|
|
62
|
+
│ Parse Message │
|
|
63
|
+
│ jobType: start │
|
|
64
|
+
│ stop │
|
|
65
|
+
│ clear │
|
|
77
66
|
└────────┬─────────┘
|
|
78
67
|
│
|
|
68
|
+
├──────────────────────────────────────┐
|
|
69
|
+
│ │
|
|
70
|
+
┌────────▼─────────┐ ┌────────▼─────────┐
|
|
71
|
+
│ jobType=start │ │ jobType=stop │
|
|
72
|
+
│ │ │ jobType=clear │
|
|
73
|
+
└────────┬─────────┘ └────────┬─────────┘
|
|
74
|
+
│ │
|
|
75
|
+
┌────────▼─────────┐ ┌────────▼─────────┐
|
|
76
|
+
│ Spawn Thread │ │ Spawn Thread │
|
|
77
|
+
│ Call _start() │ │ Execute job │
|
|
78
|
+
└────────┬─────────┘ └────────┬─────────┘
|
|
79
|
+
│ │
|
|
80
|
+
┌────────▼────────────────┐ │
|
|
81
|
+
│ Create K8s Pod │ │
|
|
82
|
+
│ (POST to K8s API) │ │
|
|
83
|
+
└────────┬────────────────┘ │
|
|
84
|
+
│ │
|
|
85
|
+
┌────────┼──────────────────┐ │
|
|
86
|
+
│ │ │ │
|
|
87
|
+
▼ ▼ ▼ │
|
|
88
|
+
┌─────────────┐ ┌──────────────────┐ │
|
|
89
|
+
│ Success │ │ K8s API Error │ │
|
|
90
|
+
│ (201) │ │ Network timeout │ │
|
|
91
|
+
└─────┬───────┘ └──────────┬───────┘ │
|
|
92
|
+
│ │ │
|
|
93
|
+
│ ▼ │
|
|
94
|
+
│ ┌────────────────────┐ │
|
|
95
|
+
│ │ .on('error') │ │
|
|
96
|
+
│ │ retryCount < 3? │ │
|
|
97
|
+
│ │ YES: NACK (retry) │ │
|
|
98
|
+
│ │ NO: FAILURE + ACK │ │
|
|
99
|
+
│ └────────────────────┘ │
|
|
100
|
+
│ │
|
|
101
|
+
▼ │
|
|
102
|
+
┌──────────────────────────────┐ │
|
|
103
|
+
│ Pod created successfully │ │
|
|
104
|
+
│ .on('message') │ │
|
|
105
|
+
└──────────┬───────────────────┘ │
|
|
106
|
+
│ │
|
|
107
|
+
▼ │
|
|
108
|
+
┌──────────────────────────────┐ │
|
|
109
|
+
│ ACK message immediately │◄──────────────────┘
|
|
110
|
+
│ (free up prefetch slot) │
|
|
111
|
+
└──────────┬───────────────────┘
|
|
112
|
+
│
|
|
113
|
+
▼
|
|
114
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
115
|
+
│ Push to sdretry-wait for verification │
|
|
116
|
+
│ - Add header: x-build-start-time = Date.now() │
|
|
117
|
+
│ - Add header: x-retry-count = 0 │
|
|
118
|
+
│ - Set per-message TTL: 30 seconds (expiration property) │
|
|
119
|
+
│ - Publishes to: sdretry-wait (not sdretry directly!) │
|
|
120
|
+
└──────────┬──────────────────────────────────────────────────┘
|
|
121
|
+
│
|
|
122
|
+
│
|
|
123
|
+
▼
|
|
124
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
125
|
+
│ WAIT QUEUE: sdretry-wait (waits for TTL to expire) │
|
|
126
|
+
│ - Message sits here for TTL duration (30s default) │
|
|
127
|
+
│ - When TTL expires → Dead-letter to sdretry │
|
|
128
|
+
└─────────────────────────────────────────────────────────────┘
|
|
129
|
+
│
|
|
130
|
+
│ (after TTL expires)
|
|
131
|
+
▼
|
|
132
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
133
|
+
│ RETRY QUEUE: sdretry (ready for consumption) │
|
|
134
|
+
│ - Consumer picks up message for pod verification │
|
|
135
|
+
└─────────────────────────────────────────────────────────────┘
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Retry Queue Processing (sdretry)
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
142
|
+
│ RETRY QUEUE: Pod Verification & Status Check │
|
|
143
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
144
|
+
|
|
145
|
+
┌──────────────────────────────────┐
|
|
146
|
+
│ Consumer picks up message │
|
|
147
|
+
│ from sdretry │
|
|
148
|
+
│ Headers: x-build-start-time │
|
|
149
|
+
│ x-retry-count │
|
|
150
|
+
└──────────┬───────────────────────┘
|
|
151
|
+
│
|
|
152
|
+
▼
|
|
153
|
+
┌──────────────────────────────────┐
|
|
154
|
+
│ Check retry count │
|
|
155
|
+
│ retryCount = x-retry-count || 0 │
|
|
156
|
+
│ if retryCount >= 6: FAIL │
|
|
157
|
+
└──────────┬───────────────────────┘
|
|
158
|
+
│
|
|
159
|
+
▼
|
|
160
|
+
┌──────────────────────────────────┐
|
|
161
|
+
│ Spawn Thread │
|
|
162
|
+
│ Call _verify() │
|
|
163
|
+
└──────────┬───────────────────────┘
|
|
164
|
+
│
|
|
165
|
+
┌──────────▼──────────────┐
|
|
166
|
+
│ Get Pod Status │
|
|
167
|
+
│ (GET pods?labelSelector)│
|
|
168
|
+
└──────────┬──────────────┘
|
|
169
|
+
│
|
|
170
|
+
┌──────────┼────────────────────────────────┐
|
|
171
|
+
│ │ │
|
|
172
|
+
▼ ▼ ▼
|
|
173
|
+
┌─────────────────────┐ ┌──────────────────────────┐
|
|
174
|
+
│ Status: 'waiting' │ │ Status: 'initializing' │
|
|
175
|
+
│ (pod not scheduled) │ │ (pod pulling image) │
|
|
176
|
+
└──────────┬──────────┘ └──────────┬───────────────┘
|
|
177
|
+
│ │
|
|
178
|
+
▼ ▼
|
|
179
|
+
┌────────────────────────────────────────────────────┐
|
|
180
|
+
│ Check Init Timeout │
|
|
181
|
+
│ ONLY for 'waiting' │
|
|
182
|
+
│ elapsed = now - x-build-start-time │
|
|
183
|
+
│ if elapsed >= 3min: TIMEOUT │
|
|
184
|
+
└────────────┬───────────────────────────────────────┘
|
|
79
185
|
│
|
|
80
|
-
|
|
81
|
-
│
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
│
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
│
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
│
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
186
|
+
┌────────┼─────────┐
|
|
187
|
+
│ │ │
|
|
188
|
+
▼ ▼ ▼
|
|
189
|
+
┌─────────────┐ ┌──────────────┐
|
|
190
|
+
│ Timeout! │ │ Within time │
|
|
191
|
+
│ elapsed>=3m │ │ elapsed<3m │
|
|
192
|
+
└─────┬───────┘ └──────┬───────┘
|
|
193
|
+
│ │
|
|
194
|
+
▼ ▼
|
|
195
|
+
┌──────────────────┐ ┌────────────────────────────────┐
|
|
196
|
+
│ FAIL BUILD │ │ Retry with appropriate delay │
|
|
197
|
+
│ "Pod scheduling │ │ │
|
|
198
|
+
│ timeout exceeded"│ │ 'waiting': Fixed 30s delay │
|
|
199
|
+
│ ACK + Stop │ │ 'initializing': Progressive │
|
|
200
|
+
└──────────────────┘ │ 30s + (retryCount × 10s) │
|
|
201
|
+
└─────────┬──────────────────────┘
|
|
202
|
+
│
|
|
203
|
+
▼
|
|
204
|
+
┌───────────────────────────────┐
|
|
205
|
+
│ ACK current message │
|
|
206
|
+
│ Publish to sdretry-wait │
|
|
207
|
+
│ with new TTL (expiration) │
|
|
208
|
+
│ and x-retry-count += 1 │
|
|
209
|
+
└───────────┬───────────────────┘
|
|
210
|
+
│
|
|
211
|
+
▼
|
|
212
|
+
┌───────────────────────────────┐
|
|
213
|
+
│ Message waits in sdretry-wait│
|
|
214
|
+
│ for TTL duration │
|
|
215
|
+
│ Then dead-letter → sdretry │
|
|
216
|
+
└───────────────────────────────┘
|
|
217
|
+
|
|
218
|
+
Other status codes:
|
|
219
|
+
'' (empty string) → ACK (success, pod running)
|
|
220
|
+
Error message → ACK + Update build → FAILURE
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Pod Status Decision Tree
|
|
224
|
+
|
|
225
|
+
```
|
|
226
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
227
|
+
│ POD VERIFICATION LOGIC (_verify in executor-k8s/index.js) │
|
|
228
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
229
|
+
|
|
230
|
+
Check Pod Status
|
|
231
|
+
│
|
|
232
|
+
┌────┴──────────────────────────────────────────────────┐
|
|
233
|
+
│ │
|
|
234
|
+
▼ ▼
|
|
235
|
+
Container Waiting Reason? Pod Phase?
|
|
236
|
+
│ │
|
|
237
|
+
├─ ErrImagePull ──────────┐ │
|
|
238
|
+
├─ ImagePullBackOff ───────┼────► FAIL FAST │
|
|
239
|
+
├─ InvalidImageName ────────┘ "Check your image" │
|
|
240
|
+
│ │
|
|
241
|
+
├─ CrashLoopBackOff ───────┐ │
|
|
242
|
+
├─ CreateContainerError ────┼────► FAIL FAST │
|
|
243
|
+
├─ StartError ──────────────┘ "Contact admin" │
|
|
244
|
+
│ │
|
|
245
|
+
└─ (none/other) ────────────────────────────────────────┼──► Check phase
|
|
246
|
+
│
|
|
247
|
+
├─ Running ──────► SUCCESS ('')
|
|
248
|
+
├─ Succeeded ────► SUCCESS ('')
|
|
249
|
+
├─ Failed ───────► FAILURE (error msg)
|
|
250
|
+
├─ Unknown ──────► FAILURE (error msg)
|
|
251
|
+
│
|
|
252
|
+
└─ Pending ──┐
|
|
253
|
+
│
|
|
254
|
+
┌───────────────▼──────────────┐
|
|
255
|
+
│ Has nodeName assigned? │
|
|
256
|
+
└───────────────┬──────────────┘
|
|
257
|
+
│
|
|
258
|
+
┌────────────────────┼────────────────────┐
|
|
259
|
+
│ │ │
|
|
260
|
+
▼ ▼ ▼
|
|
261
|
+
┌───────────────┐ ┌─────────────────┐ ┌──────────────┐
|
|
262
|
+
│ nodeName: NO │ │ nodeName: YES │ │ Other cases │
|
|
263
|
+
│ (not sched) │ │ (initializing) │ │ │
|
|
264
|
+
└───────┬───────┘ └────────┬────────┘ └──────┬───────┘
|
|
265
|
+
│ │ │
|
|
266
|
+
▼ ▼ ▼
|
|
267
|
+
┌───────────────┐ ┌─────────────────┐ ┌──────────────┐
|
|
268
|
+
│ Return │ │ Return │ │ Fail or │
|
|
269
|
+
│ 'waiting' │ │ 'initializing' │ │ other status │
|
|
270
|
+
│ │ │ │ │ │
|
|
271
|
+
│ (pod waiting │ │ (pod pulling │ └──────────────┘
|
|
272
|
+
│ to schedule) │ │ image) │
|
|
273
|
+
└───────────────┘ └─────────────────┘
|
|
274
|
+
|
|
275
|
+
Status Code Meanings:
|
|
276
|
+
- '' (empty string) → Pod is running successfully
|
|
277
|
+
- 'waiting' → Pod not scheduled (counts against 3min timeout)
|
|
278
|
+
- 'initializing' → Pod pulling image (progressive backoff, no timeout)
|
|
279
|
+
- Error message string → Immediate failure (ImagePullBackOff, CrashLoopBackOff, etc.)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## Queue Configuration
|
|
283
|
+
|
|
284
|
+
### RabbitMQ Queue Definitions
|
|
285
|
+
|
|
286
|
+
**sdQueue** (main queue for consumers):
|
|
287
|
+
```json
|
|
288
|
+
{
|
|
289
|
+
"name": "sdQueue",
|
|
290
|
+
"vhost": "screwdriver",
|
|
291
|
+
"durable": true,
|
|
292
|
+
"auto_delete": false,
|
|
293
|
+
"arguments": {
|
|
294
|
+
"x-dead-letter-exchange": "build",
|
|
295
|
+
"x-dead-letter-routing-key": "sdQueuedlr",
|
|
296
|
+
"x-max-priority": 3,
|
|
297
|
+
"x-message-ttl": 28800000
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
```
|
|
301
|
+
**sdQueuedlr** (dead-letter/delay queue for messages that fail to be ACK'd; holds them for 5s, then routes them back to `sdQueue`):
|
|
302
|
+
```json
|
|
303
|
+
{
|
|
304
|
+
"name": "sdQueuedlr",
|
|
305
|
+
"vhost": "screwdriver",
|
|
306
|
+
"durable": true,
|
|
307
|
+
"auto_delete": false,
|
|
308
|
+
"arguments": {
|
|
309
|
+
"x-dead-letter-exchange": "build",
|
|
310
|
+
"x-dead-letter-routing-key": "sdQueue",
|
|
311
|
+
"x-max-priority": 3,
|
|
312
|
+
"x-message-ttl": 5000,
|
|
313
|
+
"x-queue-mode": "lazy"
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
**sdRetryQueue** (ready queue for consumers):
|
|
319
|
+
```json
|
|
320
|
+
{
|
|
321
|
+
"name": "sdRetryQueue",
|
|
322
|
+
"vhost": "screwdriver",
|
|
323
|
+
"durable": true,
|
|
324
|
+
"auto_delete": false,
|
|
325
|
+
"arguments": {
|
|
326
|
+
"x-max-priority": 3,
|
|
327
|
+
"x-queue-type": "classic"
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
**IMPORTANT**: `sdRetryQueue` must NOT have `x-message-ttl` to allow per-message TTL!
|
|
333
|
+
|
|
334
|
+
**sdRetryQueue-wait** (wait queue with dead-letter routing):
|
|
335
|
+
```json
|
|
336
|
+
{
|
|
337
|
+
"name": "sdRetryQueue-wait",
|
|
338
|
+
"vhost": "screwdriver",
|
|
339
|
+
"durable": true,
|
|
340
|
+
"auto_delete": false,
|
|
341
|
+
"arguments": {
|
|
342
|
+
"x-dead-letter-exchange": "build",
|
|
343
|
+
"x-dead-letter-routing-key": "sdretry",
|
|
344
|
+
"x-max-priority": 3,
|
|
345
|
+
"x-queue-type": "classic"
|
|
346
|
+
}
|
|
347
|
+
}
|
|
146
348
|
```
|
|
147
349
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
### Main Queue Retries (NACK):
|
|
151
|
-
- **When**: Pod creation throws exception (K8s API error, network issue)
|
|
152
|
-
- **Why**: Pod was never created, safe to retry
|
|
153
|
-
- **How many**: Up to 5 times via RabbitMQ requeue
|
|
154
|
-
- **After max retries**: Update build to FAILURE and ACK
|
|
155
|
-
|
|
156
|
-
### Retry Queue Retries (NACK):
|
|
157
|
-
- **When**: _verify() throws exception (can't get pod status from K8s)
|
|
158
|
-
- **Why**: Transient API issue, pod might be fine
|
|
159
|
-
- **How many**: Up to 5 times via RabbitMQ requeue
|
|
160
|
-
- **After max retries**: Update build to FAILURE and ACK
|
|
161
|
-
|
|
162
|
-
### No Retries (ACK immediately):
|
|
163
|
-
- Pod created successfully (pending/running status) → main queue
|
|
164
|
-
- Pod status check failed (pod exists but failed/unknown) → main queue → retry queue
|
|
165
|
-
- Verify detects failed pod (returns error message) → retry queue
|
|
166
|
-
- Verify detects healthy pod (returns empty) → retry queue
|
|
350
|
+
> **TODO**: Use the RabbitMQ delayed message exchange plugin (https://github.com/rabbitmq/rabbitmq-delayed-message-exchange) instead of the wait-queue/TTL pattern.
|
|
@@ -359,12 +359,16 @@ rabbitmq:
|
|
|
359
359
|
messageReprocessLimit: RABBITMQ_MSG_REPROCESS_LIMIT
|
|
360
360
|
# Queue name of the retry queue
|
|
361
361
|
retryQueue: RABBITMQ_RETRYQUEUE
|
|
362
|
+
# Queue name of the delayed retry queue
|
|
363
|
+
retryDelayedQueue: RABBITMQ_RETRYDELAYEDQUEUE
|
|
362
364
|
# retry queue enable/disable flag
|
|
363
365
|
retryQueueEnabled: RABBITMQ_RETRYQUEUE_ENABLED
|
|
364
366
|
# Exchange / router name for rabbitmq
|
|
365
367
|
exchange: RABBITMQ_EXCHANGE
|
|
366
368
|
# build pod initialization timeout
|
|
367
369
|
initTimeout: RABBITMQ_BUILD_INIT_TIMEOUT
|
|
370
|
+
# delay between retries in seconds
|
|
371
|
+
retryDelay: RABBITMQ_RETRY_DELAY
|
|
368
372
|
httpd:
|
|
369
373
|
# Port to listen on
|
|
370
374
|
port: PORT
|
package/config/default.yaml
CHANGED
|
@@ -240,15 +240,19 @@ rabbitmq:
|
|
|
240
240
|
# Prefetch count
|
|
241
241
|
prefetchCount: "20"
|
|
242
242
|
# Message reprocess limit - max retry for a message
|
|
243
|
-
messageReprocessLimit: "
|
|
243
|
+
messageReprocessLimit: "6" # short wait but more retries
|
|
244
244
|
# Queue name of the retry queue
|
|
245
245
|
retryQueue: sdRetryQueue
|
|
246
|
+
# Queue name of the delayed retry queue
|
|
247
|
+
retryDelayedQueue: sdRetryQueue-wait
|
|
246
248
|
# retry queue enable/disable flag
|
|
247
249
|
retryQueueEnabled: false
|
|
248
250
|
# Exchange / router name for rabbitmq
|
|
249
251
|
exchange: build
|
|
250
252
|
# build pod initialization timeout in minutes
|
|
251
253
|
initTimeout: "5"
|
|
254
|
+
# delay between retries in seconds
|
|
255
|
+
retryDelay: "30"
|
|
252
256
|
httpd:
|
|
253
257
|
# Port to listen on
|
|
254
258
|
port: 80
|
package/lib/config.js
CHANGED
|
@@ -18,9 +18,11 @@ const {
|
|
|
18
18
|
prefetchCount,
|
|
19
19
|
messageReprocessLimit,
|
|
20
20
|
retryQueue,
|
|
21
|
+
retryDelayedQueue,
|
|
21
22
|
retryQueueEnabled,
|
|
22
23
|
exchange,
|
|
23
|
-
initTimeout
|
|
24
|
+
initTimeout,
|
|
25
|
+
retryDelay
|
|
24
26
|
} = rabbitmqConfig;
|
|
25
27
|
const amqpURI = `${protocol}://${username}:${password}@${host}:${port}${vhost}`;
|
|
26
28
|
|
|
@@ -60,7 +62,9 @@ function getConfig() {
|
|
|
60
62
|
retryQueue,
|
|
61
63
|
retryQueueEnabled: convertToBool(retryQueueEnabled),
|
|
62
64
|
exchange,
|
|
63
|
-
initTimeout: Number(initTimeout) || 5
|
|
65
|
+
initTimeout: Number(initTimeout) || 5,
|
|
66
|
+
retryDelay: Number(retryDelay) || 30,
|
|
67
|
+
retryDelayedQueue
|
|
64
68
|
};
|
|
65
69
|
}
|
|
66
70
|
|
package/lib/retry-queue.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
const amqp = require('amqp-connection-manager');
|
|
4
4
|
const logger = require('screwdriver-logger');
|
|
5
5
|
const config = require('./config');
|
|
6
|
-
const { amqpURI, connectOptions,
|
|
6
|
+
const { amqpURI, connectOptions, exchange, retryQueueEnabled, retryDelayedQueue } = config.getConfig();
|
|
7
7
|
|
|
8
8
|
let retryQueueConn;
|
|
9
9
|
|
|
@@ -29,12 +29,16 @@ function getRetryQueueConn() {
|
|
|
29
29
|
}
|
|
30
30
|
|
|
31
31
|
/**
|
|
32
|
-
* Pushes a message to the retry queue
|
|
32
|
+
* Pushes a message to the retry wait queue (delay queue)
|
|
33
|
+
* Messages will sit in the wait queue for the specified delay before being routed to the ready queue
|
|
33
34
|
* @param {message} buildConfig build config
|
|
34
35
|
* @param {messageId} messageId id of the message queue
|
|
36
|
+
* @param {number} delayMs delay in milliseconds (supports dynamic delays for progressive backoff)
|
|
37
|
+
* @param {number} retryCount current retry count (optional, defaults to 0)
|
|
38
|
+
* @param {number} buildStartTime timestamp when build verification started (optional)
|
|
35
39
|
* @returns {Promise} resolves to null or error
|
|
36
40
|
*/
|
|
37
|
-
async function push(buildConfig, messageId) {
|
|
41
|
+
async function push(buildConfig, messageId, delayMs = 30000, retryCount = 0, buildStartTime = null) {
|
|
38
42
|
if (!retryQueueEnabled) {
|
|
39
43
|
return Promise.resolve();
|
|
40
44
|
}
|
|
@@ -49,20 +53,37 @@ async function push(buildConfig, messageId) {
|
|
|
49
53
|
setup: channel => channel.checkExchange(exchange)
|
|
50
54
|
});
|
|
51
55
|
|
|
52
|
-
|
|
56
|
+
// Publish to the WAIT queue (not the ready queue), using a per-message TTL for the delay
|
|
57
|
+
const waitQueue = retryDelayedQueue;
|
|
58
|
+
const delaySec = (delayMs / 1000).toFixed(0);
|
|
59
|
+
|
|
60
|
+
logger.info('publishing msg to retry wait queue: %s (will delay %ss)', messageId, delaySec);
|
|
61
|
+
|
|
62
|
+
// Add headers for timeout tracking and retry count
|
|
63
|
+
const headers = {
|
|
64
|
+
'x-build-start-time': buildStartTime || Date.now(),
|
|
65
|
+
'x-retry-count': retryCount
|
|
66
|
+
};
|
|
53
67
|
|
|
54
68
|
return channelWrapper
|
|
55
|
-
.publish(exchange,
|
|
69
|
+
.publish(exchange, waitQueue, message, {
|
|
56
70
|
contentType: 'application/json',
|
|
57
|
-
persistent: true
|
|
71
|
+
persistent: true,
|
|
72
|
+
headers,
|
|
73
|
+
expiration: String(delayMs)
|
|
58
74
|
})
|
|
59
75
|
.then(() => {
|
|
60
|
-
logger.info(
|
|
76
|
+
logger.info(
|
|
77
|
+
'successfully published msg id %s -> wait queue %s (delay: %ss)',
|
|
78
|
+
messageId,
|
|
79
|
+
waitQueue,
|
|
80
|
+
delaySec
|
|
81
|
+
);
|
|
61
82
|
|
|
62
83
|
return channelWrapper.close();
|
|
63
84
|
})
|
|
64
85
|
.catch(err => {
|
|
65
|
-
logger.error('publishing failed to retry queue: %s', err.message);
|
|
86
|
+
logger.error('publishing failed to retry wait queue: %s', err.message);
|
|
66
87
|
channelWrapper.close();
|
|
67
88
|
|
|
68
89
|
throw err;
|
package/package.json
CHANGED
package/receiver.js
CHANGED
|
@@ -17,12 +17,13 @@ const {
|
|
|
17
17
|
cachePath,
|
|
18
18
|
retryQueue,
|
|
19
19
|
retryQueueEnabled,
|
|
20
|
-
|
|
20
|
+
exchange,
|
|
21
|
+
initTimeout,
|
|
22
|
+
retryDelay
|
|
21
23
|
} = config.getConfig();
|
|
22
24
|
const { spawn } = threads;
|
|
23
25
|
const CACHE_STRATEGY_DISK = 'disk';
|
|
24
26
|
let channelWrapper;
|
|
25
|
-
const INIT_TIMEOUT = initTimeout * 60 * 1000; // milliseconds
|
|
26
27
|
|
|
27
28
|
/**
|
|
28
29
|
* onMessage consumes messages in batches once they are available in the queue. channelWrapper has built-in back pressure
|
|
@@ -105,68 +106,22 @@ const onMessage = data => {
|
|
|
105
106
|
}
|
|
106
107
|
}
|
|
107
108
|
|
|
108
|
-
let timeoutWarningLogged = false;
|
|
109
|
-
let timeoutTimer = null;
|
|
110
|
-
|
|
111
|
-
if (jobType === 'start') {
|
|
112
|
-
timeoutTimer = setTimeout(async () => {
|
|
113
|
-
if (!timeoutWarningLogged) {
|
|
114
|
-
timeoutWarningLogged = true;
|
|
115
|
-
const timeoutMessage = `Build initialization timeout exceeded (${initTimeout}min) for ${job}`;
|
|
116
|
-
|
|
117
|
-
logger.error(timeoutMessage);
|
|
118
|
-
|
|
119
|
-
// Update build statusmessage only to show delayed initialization
|
|
120
|
-
try {
|
|
121
|
-
await helper.updateBuildStatusAsync(
|
|
122
|
-
buildConfig,
|
|
123
|
-
undefined,
|
|
124
|
-
'Build initialization delayed - pod creation taking longer than expected'
|
|
125
|
-
);
|
|
126
|
-
logger.info(`Build status updated with delay warning for build ${buildId}`);
|
|
127
|
-
} catch (err) {
|
|
128
|
-
logger.error(
|
|
129
|
-
`Failed to update build status with delay warning for build:${buildId}:${err}`
|
|
130
|
-
);
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
// Push to retry queue for verification and potential failure
|
|
134
|
-
// This allows verify to check pod status and fail if still pending
|
|
135
|
-
logger.info(`Pushing ${job} to retry queue for verification after timeout`);
|
|
136
|
-
retryQueueLib.push(buildConfig, buildId);
|
|
137
|
-
}
|
|
138
|
-
}, INIT_TIMEOUT);
|
|
139
|
-
}
|
|
140
|
-
|
|
141
109
|
thread
|
|
142
110
|
.send([jobType, buildConfig, job])
|
|
143
111
|
.on('message', successful => {
|
|
144
112
|
logger.info(`acknowledge, job completed for ${job}, result: ${successful}`);
|
|
145
113
|
|
|
146
|
-
if (
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
}
|
|
152
|
-
retryQueueLib.push(buildConfig, buildId);
|
|
153
|
-
} else if (successful && jobType === 'start') {
|
|
154
|
-
// Pod created successfully - DON'T clear timeout
|
|
155
|
-
// Let the timeout fire to verify pod eventually started
|
|
156
|
-
// This handles pods that get stuck in pending after creation
|
|
157
|
-
logger.info(`Timeout remains active for ${job}, will verify after ${initTimeout}min`);
|
|
158
|
-
} else if (timeoutTimer) {
|
|
159
|
-
// For non-start jobs (stop, verify), or other cases, clear timeout normally
|
|
160
|
-
clearTimeout(timeoutTimer);
|
|
114
|
+
if (jobType === 'start') {
|
|
115
|
+
logger.info(`Pushing ${job} to retry queue for verification`);
|
|
116
|
+
retryQueueLib.push(buildConfig, buildId).catch(err => {
|
|
117
|
+
logger.error(`Failed to push to retry queue for ${job}: ${err.message}`);
|
|
118
|
+
});
|
|
161
119
|
}
|
|
162
120
|
|
|
163
121
|
channelWrapper.ack(data);
|
|
164
122
|
thread.kill();
|
|
165
123
|
})
|
|
166
124
|
.on('error', async error => {
|
|
167
|
-
if (timeoutTimer) {
|
|
168
|
-
clearTimeout(timeoutTimer);
|
|
169
|
-
}
|
|
170
125
|
thread.kill();
|
|
171
126
|
if (['403', '404'].includes(error.message.substring(0, 3))) {
|
|
172
127
|
channelWrapper.ack(data);
|
|
@@ -220,25 +175,155 @@ const onRetryMessage = async data => {
|
|
|
220
175
|
|
|
221
176
|
logger.info(`processing ${job}`);
|
|
222
177
|
|
|
178
|
+
const buildStartTime =
|
|
179
|
+
data.properties.headers && data.properties.headers['x-build-start-time']
|
|
180
|
+
? data.properties.headers['x-build-start-time']
|
|
181
|
+
: null;
|
|
182
|
+
const initTimeoutMs = initTimeout * 60 * 1000;
|
|
183
|
+
|
|
223
184
|
if (typeof data.properties.headers !== 'undefined') {
|
|
224
185
|
if (Object.keys(data.properties.headers).length > 0) {
|
|
225
|
-
|
|
226
|
-
|
|
186
|
+
if (data.properties.headers['x-retry-count']) {
|
|
187
|
+
retryCount = data.properties.headers['x-retry-count'];
|
|
188
|
+
logger.info(`retrying ${retryCount}(${messageReprocessLimit}) for ${job}`);
|
|
189
|
+
} else if (data.properties.headers['x-death']) {
|
|
190
|
+
retryCount = data.properties.headers['x-death'][0].count;
|
|
191
|
+
logger.info(`retrying ${retryCount}(${messageReprocessLimit}) for ${job}`);
|
|
192
|
+
}
|
|
227
193
|
}
|
|
228
194
|
}
|
|
195
|
+
|
|
229
196
|
thread
|
|
230
197
|
.send([jobType, buildConfig, job])
|
|
231
198
|
.on('message', async message => {
|
|
232
199
|
logger.info(`acknowledge, job completed for ${job}, result: ${message}`);
|
|
233
|
-
|
|
200
|
+
|
|
201
|
+
if (message === 'waiting') {
|
|
202
|
+
// Pod not scheduled - check timeout
|
|
203
|
+
if (buildStartTime) {
|
|
204
|
+
const elapsedMs = Date.now() - buildStartTime;
|
|
205
|
+
const elapsedMinutes = (elapsedMs / 1000 / 60).toFixed(2);
|
|
206
|
+
|
|
207
|
+
logger.info(
|
|
208
|
+
`Build ${buildId} pod not scheduled yet, elapsed: ${elapsedMinutes}min, timeout: ${initTimeout}min`
|
|
209
|
+
);
|
|
210
|
+
|
|
211
|
+
if (elapsedMs >= initTimeoutMs) {
|
|
212
|
+
// Timeout exceeded - fail immediately
|
|
213
|
+
logger.error(
|
|
214
|
+
`Build ${buildId} pod scheduling timeout exceeded: ${elapsedMinutes}min > ${initTimeout}min`
|
|
215
|
+
);
|
|
216
|
+
|
|
217
|
+
// metric for alerting
|
|
218
|
+
logger.error(
|
|
219
|
+
`[BUILD_SCHEDULING_FAILURE] buildId=${buildId} elapsed_minutes=${elapsedMinutes} ` +
|
|
220
|
+
`timeout_minutes=${initTimeout} retry_count=${retryCount}`
|
|
221
|
+
);
|
|
222
|
+
|
|
223
|
+
try {
|
|
224
|
+
await helper.updateBuildStatusAsync(
|
|
225
|
+
buildConfig,
|
|
226
|
+
'FAILURE',
|
|
227
|
+
`Build failed to start within ${initTimeout} minutes (elapsed: ${elapsedMinutes} minutes). Pod was not scheduled - cluster may be out of capacity.`
|
|
228
|
+
);
|
|
229
|
+
logger.info(`Build ${buildId} marked as FAILURE due to pod scheduling timeout`);
|
|
230
|
+
} catch (err) {
|
|
231
|
+
logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
|
|
232
|
+
}
|
|
233
|
+
channelWrapper.ack(data);
|
|
234
|
+
thread.kill();
|
|
235
|
+
|
|
236
|
+
return;
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// Timeout not exceeded - retry with delay
|
|
241
|
+
if (retryCount >= messageReprocessLimit) {
|
|
242
|
+
logger.error(
|
|
243
|
+
`Build ${buildId} max retries (${messageReprocessLimit}) exceeded while waiting for pod scheduling`
|
|
244
|
+
);
|
|
245
|
+
|
|
246
|
+
// metric for alerting
|
|
247
|
+
logger.error(
|
|
248
|
+
`[BUILD_SCHEDULING_FAILURE] buildId=${buildId} elapsed_minutes=` +
|
|
249
|
+
`${((Date.now() - buildStartTime) / 1000 / 60).toFixed(2)} max_retries=${retryCount}`
|
|
250
|
+
);
|
|
251
|
+
|
|
252
|
+
try {
|
|
253
|
+
await helper.updateBuildStatusAsync(
|
|
254
|
+
buildConfig,
|
|
255
|
+
'FAILURE',
|
|
256
|
+
'Build failed to start. Pod was not scheduled after maximum retries - cluster may be out of capacity.'
|
|
257
|
+
);
|
|
258
|
+
logger.info(`Build ${buildId} marked as FAILURE due to max retries`);
|
|
259
|
+
} catch (err) {
|
|
260
|
+
logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
|
|
261
|
+
}
|
|
262
|
+
channelWrapper.ack(data);
|
|
263
|
+
} else {
|
|
264
|
+
const nextRetryCount = retryCount + 1;
|
|
265
|
+
|
|
266
|
+
logger.info(
|
|
267
|
+
`Build ${buildId} pod not scheduled, retrying ${nextRetryCount}/${messageReprocessLimit} in ${retryDelay}s`
|
|
268
|
+
);
|
|
269
|
+
channelWrapper.ack(data);
|
|
270
|
+
|
|
271
|
+
// Re-publish to retry queue with incremented retry count
|
|
272
|
+
retryQueueLib
|
|
273
|
+
.push(buildConfig, buildId, retryDelay * 1000, nextRetryCount, buildStartTime)
|
|
274
|
+
.catch(err => {
|
|
275
|
+
logger.error(`Failed to re-publish to retry queue for ${job}: ${err.message}`);
|
|
276
|
+
});
|
|
277
|
+
}
|
|
278
|
+
} else if (message === 'initializing') {
|
|
279
|
+
// Pod is initializing (pulling image) - use progressive backoff for large images
|
|
280
|
+
if (retryCount >= messageReprocessLimit) {
|
|
281
|
+
logger.error(
|
|
282
|
+
`Build ${buildId} max retries (${messageReprocessLimit}) exceeded while pod initializing/pulling image`
|
|
283
|
+
);
|
|
284
|
+
try {
|
|
285
|
+
await helper.updateBuildStatusAsync(
|
|
286
|
+
buildConfig,
|
|
287
|
+
'FAILURE',
|
|
288
|
+
'Build failed to start. Pod initialization timeout - pod may be stuck pulling a large image or container startup is slow.'
|
|
289
|
+
);
|
|
290
|
+
logger.info(`Build ${buildId} marked as FAILURE due to max retries during initialization`);
|
|
291
|
+
} catch (err) {
|
|
292
|
+
logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
|
|
293
|
+
}
|
|
294
|
+
channelWrapper.ack(data);
|
|
295
|
+
} else {
|
|
296
|
+
const nextRetryCount = retryCount + 1;
|
|
297
|
+
|
|
298
|
+
const baseDelayMs = retryDelay * 1000;
|
|
299
|
+
const incrementMs = 10000 * retryCount;
|
|
300
|
+
const delayMs = baseDelayMs + incrementMs;
|
|
301
|
+
const delaySec = (delayMs / 1000).toFixed(0);
|
|
302
|
+
|
|
303
|
+
logger.info(
|
|
304
|
+
`Build ${buildId} pod still initializing/pulling image, retrying ${nextRetryCount}/${messageReprocessLimit} in ${delaySec}s (progressive backoff)`
|
|
305
|
+
);
|
|
306
|
+
channelWrapper.ack(data);
|
|
307
|
+
|
|
308
|
+
// Re-publish to retry queue with incremented retry count and progressive delay
|
|
309
|
+
retryQueueLib.push(buildConfig, buildId, delayMs, nextRetryCount, buildStartTime).catch(err => {
|
|
310
|
+
logger.error(`Failed to re-publish to retry queue for ${job}: ${err.message}`);
|
|
311
|
+
});
|
|
312
|
+
}
|
|
313
|
+
} else if (message && message !== '') {
|
|
314
|
+
// Pod has failed - update build status and ack
|
|
234
315
|
try {
|
|
235
316
|
await helper.updateBuildStatusAsync(buildConfig, 'FAILURE', message);
|
|
236
317
|
logger.info(`build status successfully updated for build ${buildId}`);
|
|
237
318
|
} catch (err) {
|
|
238
319
|
logger.error(`Failed to update build status to FAILURE for build:${buildId}:${err}`);
|
|
239
320
|
}
|
|
321
|
+
channelWrapper.ack(data);
|
|
322
|
+
} else {
|
|
323
|
+
// Empty string means pod is running successfully - ack
|
|
324
|
+
logger.info(`pod started successfully for ${job}, acknowledging`);
|
|
325
|
+
channelWrapper.ack(data);
|
|
240
326
|
}
|
|
241
|
-
channelWrapper.ack(data);
|
|
242
327
|
thread.kill();
|
|
243
328
|
})
|
|
244
329
|
.on('error', async error => {
|
|
@@ -288,7 +373,11 @@ const listen = async () => {
|
|
|
288
373
|
const queueFn = [channel.checkQueue(queue), channel.prefetch(prefetchCount), channel.consume(queue, onMessage)];
|
|
289
374
|
|
|
290
375
|
if (retryQueueEnabled) {
|
|
291
|
-
queueFn.push(
|
|
376
|
+
queueFn.push(
|
|
377
|
+
channel.checkQueue(retryQueue),
|
|
378
|
+
channel.bindQueue(retryQueue, exchange, retryQueue),
|
|
379
|
+
channel.consume(retryQueue, onRetryMessage)
|
|
380
|
+
);
|
|
292
381
|
}
|
|
293
382
|
|
|
294
383
|
return Promise.all(queueFn);
|