@platformatic/watt-extra 1.6.2 → 1.6.3-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -9
- package/plugins/alerts.js +8 -9
- package/plugins/env.js +1 -0
- package/plugins/flamegraphs.js +145 -38
- package/plugins/health-signals.js +11 -8
- package/plugins/update.js +2 -2
- package/test/alerts.test.js +212 -4
- package/test/helper.js +3 -0
- package/test/trigger-flamegraphs.test.js +149 -49
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@platformatic/watt-extra",
|
|
3
|
-
"version": "1.6.2",
|
|
3
|
+
"version": "1.6.3-alpha.1",
|
|
4
4
|
"description": "The Platformatic runtime manager",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
@@ -19,10 +19,10 @@
|
|
|
19
19
|
},
|
|
20
20
|
"devDependencies": {
|
|
21
21
|
"@fastify/websocket": "^11.1.0",
|
|
22
|
-
"@platformatic/composer": "^3.
|
|
23
|
-
"@platformatic/next": "^3.
|
|
24
|
-
"@platformatic/node": "^3.
|
|
25
|
-
"@platformatic/service": "^3.
|
|
22
|
+
"@platformatic/composer": "^3.25.0",
|
|
23
|
+
"@platformatic/next": "^3.25.0",
|
|
24
|
+
"@platformatic/node": "^3.25.0",
|
|
25
|
+
"@platformatic/service": "^3.25.0",
|
|
26
26
|
"atomic-sleep": "^1.0.0",
|
|
27
27
|
"borp": "^0.21.0",
|
|
28
28
|
"eslint": "9",
|
|
@@ -30,16 +30,16 @@
|
|
|
30
30
|
"fastify-plugin": "^5.0.1",
|
|
31
31
|
"neostandard": "^0.12.0",
|
|
32
32
|
"next": "^16.0.0",
|
|
33
|
-
"platformatic": "^3.
|
|
33
|
+
"platformatic": "^3.25.0",
|
|
34
34
|
"pprof-format": "^2.1.0",
|
|
35
35
|
"why-is-node-running": "^2.3.0"
|
|
36
36
|
},
|
|
37
37
|
"dependencies": {
|
|
38
38
|
"@datadog/pprof": "^5.9.0",
|
|
39
39
|
"@fastify/error": "^4.2.0",
|
|
40
|
-
"@platformatic/foundation": "^3.
|
|
41
|
-
"@platformatic/runtime": "^3.
|
|
42
|
-
"@platformatic/wattpm-pprof-capture": "^3.
|
|
40
|
+
"@platformatic/foundation": "^3.25.0",
|
|
41
|
+
"@platformatic/runtime": "^3.25.0",
|
|
42
|
+
"@platformatic/wattpm-pprof-capture": "^3.25.0",
|
|
43
43
|
"avvio": "^9.1.0",
|
|
44
44
|
"chalk": "^4.1.2",
|
|
45
45
|
"commist": "^3.2.0",
|
package/plugins/alerts.js
CHANGED
|
@@ -3,9 +3,9 @@ import { request } from 'undici'
|
|
|
3
3
|
async function alerts (app, _opts) {
|
|
4
4
|
const healthCache = [] // It's OK to have this in memory, this is per-pod.
|
|
5
5
|
const podHealthWindow =
|
|
6
|
-
app.instanceConfig?.
|
|
6
|
+
app.instanceConfig?.scaler?.podHealthWindow || 60 * 1000
|
|
7
7
|
const alertRetentionWindow =
|
|
8
|
-
app.instanceConfig?.
|
|
8
|
+
app.instanceConfig?.scaler?.alertRetentionWindow || 10 * 1000
|
|
9
9
|
|
|
10
10
|
const lastServicesAlertTime = {}
|
|
11
11
|
|
|
@@ -40,6 +40,7 @@ async function alerts (app, _opts) {
|
|
|
40
40
|
}
|
|
41
41
|
|
|
42
42
|
const timestamp = Date.now()
|
|
43
|
+
const workerId = healthInfo.id
|
|
43
44
|
const serviceId = healthInfo.application
|
|
44
45
|
const healthWithTimestamp = { ...healthInfo, timestamp, service: serviceId }
|
|
45
46
|
delete healthWithTimestamp.healthConfig // we don't need to store this
|
|
@@ -111,14 +112,12 @@ async function alerts (app, _opts) {
|
|
|
111
112
|
|
|
112
113
|
const alert = await body.json()
|
|
113
114
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
})
|
|
119
|
-
} catch (err) {
|
|
115
|
+
app.sendFlamegraphs({
|
|
116
|
+
workerIds: [workerId],
|
|
117
|
+
alertId: alert.id
|
|
118
|
+
}).catch(err => {
|
|
120
119
|
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
121
|
-
}
|
|
120
|
+
})
|
|
122
121
|
}
|
|
123
122
|
}
|
|
124
123
|
|
package/plugins/env.js
CHANGED
|
@@ -21,6 +21,7 @@ const schema = {
|
|
|
21
21
|
PLT_FLAMEGRAPHS_INTERVAL_SEC: { type: 'number', default: 60 },
|
|
22
22
|
PLT_FLAMEGRAPHS_ELU_THRESHOLD: { type: 'number', default: 0.4 },
|
|
23
23
|
PLT_FLAMEGRAPHS_GRACE_PERIOD: { type: 'number', default: 3000 },
|
|
24
|
+
PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: { type: 'number', default: 10000 },
|
|
24
25
|
PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
|
|
25
26
|
PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
|
|
26
27
|
PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },
|
package/plugins/flamegraphs.js
CHANGED
|
@@ -8,10 +8,13 @@ async function flamegraphs (app, _opts) {
|
|
|
8
8
|
const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
|
|
9
9
|
const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
|
|
10
10
|
const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
|
|
11
|
+
const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
|
|
11
12
|
|
|
12
13
|
const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
|
|
13
14
|
const eluThreshold = parseFloat(flamegraphsELUThreshold)
|
|
14
15
|
const gracePeriod = parseInt(flamegraphsGracePeriod)
|
|
16
|
+
const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
|
|
17
|
+
const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
|
|
15
18
|
|
|
16
19
|
let workerStartedListener = null
|
|
17
20
|
|
|
@@ -125,13 +128,15 @@ async function flamegraphs (app, _opts) {
|
|
|
125
128
|
}
|
|
126
129
|
}
|
|
127
130
|
|
|
131
|
+
const profilesByWorkerId = {}
|
|
132
|
+
|
|
128
133
|
app.sendFlamegraphs = async (options = {}) => {
|
|
129
134
|
if (isFlamegraphsDisabled) {
|
|
130
135
|
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
|
|
131
136
|
return
|
|
132
137
|
}
|
|
133
138
|
|
|
134
|
-
let {
|
|
139
|
+
let { workerIds, alertId, profileType = 'cpu' } = options
|
|
135
140
|
|
|
136
141
|
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
137
142
|
if (!scalerUrl) {
|
|
@@ -139,61 +144,163 @@ async function flamegraphs (app, _opts) {
|
|
|
139
144
|
throw new Error('No scaler URL found in ICC services, cannot send flamegraph')
|
|
140
145
|
}
|
|
141
146
|
|
|
142
|
-
const podId = app.instanceId
|
|
143
147
|
const runtime = app.watt.runtime
|
|
144
148
|
|
|
145
|
-
if (!
|
|
149
|
+
if (!workerIds) {
|
|
146
150
|
const { applications } = await runtime.getApplications()
|
|
147
|
-
|
|
151
|
+
workerIds = applications.map(app => app.id)
|
|
148
152
|
}
|
|
149
153
|
|
|
150
|
-
|
|
154
|
+
cleanupFlamegraphsCache()
|
|
151
155
|
|
|
152
|
-
const uploadPromises =
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
156
|
+
const uploadPromises = workerIds.map(async (workerId) => {
|
|
157
|
+
let profile = profilesByWorkerId[workerId]
|
|
158
|
+
if (profile?.flamegraphId) {
|
|
159
|
+
const { flamegraphId } = profile
|
|
160
|
+
try {
|
|
161
|
+
await attachFlamegraphToAlerts(scalerUrl, flamegraphId, [alertId])
|
|
157
162
|
return
|
|
163
|
+
} catch (err) {
|
|
164
|
+
if (err.code === 'PLT_ATTACH_FLAMEGRAPH_MULTIPLE_ALERTS_NOT_SUPPORTED') {
|
|
165
|
+
app.log.warn(
|
|
166
|
+
'Attaching flamegraph multiple alerts is not supported by the scaler.' +
|
|
167
|
+
' Please upgrade to the latest ICC version to use this feature.'
|
|
168
|
+
)
|
|
169
|
+
} else {
|
|
170
|
+
app.log.error({ err, workerId, alertId, flamegraphId }, 'Failed to attach flamegraph to alert')
|
|
171
|
+
}
|
|
158
172
|
}
|
|
173
|
+
}
|
|
159
174
|
|
|
160
|
-
|
|
175
|
+
if (!profile) {
|
|
176
|
+
profile = await getServiceFlamegraph(workerId, profileType)
|
|
177
|
+
if (!profile || !(profile.data instanceof Uint8Array)) {
|
|
178
|
+
app.log.error({ workerId }, 'Failed to get profile from service')
|
|
179
|
+
return
|
|
180
|
+
}
|
|
181
|
+
}
|
|
161
182
|
|
|
162
|
-
|
|
183
|
+
profilesByWorkerId[workerId] = profile
|
|
163
184
|
|
|
164
|
-
|
|
165
|
-
if (alertId) {
|
|
166
|
-
query.alertId = alertId
|
|
167
|
-
}
|
|
185
|
+
const serviceId = workerId.split(':')[0]
|
|
168
186
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if (statusCode !== 200) {
|
|
180
|
-
const error = await body.text()
|
|
181
|
-
app.log.error({ error }, 'Failed to send flamegraph')
|
|
182
|
-
throw new Error(`Failed to send flamegraph: ${error}`)
|
|
183
|
-
}
|
|
187
|
+
try {
|
|
188
|
+
const flamegraph = await sendServiceFlamegraph(
|
|
189
|
+
scalerUrl,
|
|
190
|
+
serviceId,
|
|
191
|
+
profile.data,
|
|
192
|
+
profileType,
|
|
193
|
+
alertId
|
|
194
|
+
)
|
|
195
|
+
profile.flamegraphId = flamegraph.id
|
|
184
196
|
} catch (err) {
|
|
185
|
-
|
|
186
|
-
app.log.info({ serviceId, podId }, 'No profile available for the service')
|
|
187
|
-
} else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
|
|
188
|
-
app.log.info({ serviceId, podId }, 'ELU low, CPU profiling not active')
|
|
189
|
-
} else {
|
|
190
|
-
app.log.warn({ err, serviceId, podId }, 'Failed to send flamegraph from service')
|
|
191
|
-
}
|
|
197
|
+
app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
|
|
192
198
|
}
|
|
193
199
|
})
|
|
194
200
|
|
|
195
201
|
await Promise.all(uploadPromises)
|
|
196
202
|
}
|
|
203
|
+
|
|
204
|
+
async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
|
|
205
|
+
const runtime = app.watt.runtime
|
|
206
|
+
|
|
207
|
+
try {
|
|
208
|
+
const [state, profile] = await Promise.all([
|
|
209
|
+
runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
|
|
210
|
+
runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
|
|
211
|
+
])
|
|
212
|
+
return { data: profile, timestamp: state.latestProfileTimestamp }
|
|
213
|
+
} catch (err) {
|
|
214
|
+
if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
|
|
215
|
+
app.log.info(
|
|
216
|
+
{ workerId, attempt, maxAttempts, attemptTimeout },
|
|
217
|
+
'No profile available for the service. Waiting for profiling to complete.'
|
|
218
|
+
)
|
|
219
|
+
if (attempt <= maxAttempts) {
|
|
220
|
+
await sleep(attemptTimeout)
|
|
221
|
+
return getServiceFlamegraph(workerId, profileType, attempt + 1)
|
|
222
|
+
}
|
|
223
|
+
} else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
|
|
224
|
+
app.log.info({ workerId }, 'ELU low, CPU profiling not active')
|
|
225
|
+
} else {
|
|
226
|
+
app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
|
|
227
|
+
|
|
228
|
+
const [serviceId, workerIndex] = workerId.split(':')
|
|
229
|
+
if (workerIndex) {
|
|
230
|
+
app.log.warn('Worker not available, trying to get profile from another worker')
|
|
231
|
+
return getServiceFlamegraph(serviceId, profileType)
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
|
|
238
|
+
const podId = app.instanceId
|
|
239
|
+
const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
|
|
240
|
+
app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
|
|
241
|
+
|
|
242
|
+
const query = { profileType }
|
|
243
|
+
if (alertId) {
|
|
244
|
+
query.alertId = alertId
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
const authHeaders = await app.getAuthorizationHeader()
|
|
248
|
+
const { statusCode, body } = await request(url, {
|
|
249
|
+
method: 'POST',
|
|
250
|
+
headers: {
|
|
251
|
+
'Content-Type': 'application/octet-stream',
|
|
252
|
+
...authHeaders
|
|
253
|
+
},
|
|
254
|
+
query,
|
|
255
|
+
body: profile
|
|
256
|
+
})
|
|
257
|
+
|
|
258
|
+
if (statusCode !== 200) {
|
|
259
|
+
const error = await body.text()
|
|
260
|
+
app.log.error({ error }, 'Failed to send flamegraph')
|
|
261
|
+
throw new Error(`Failed to send flamegraph: ${error}`)
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
const response = await body.json()
|
|
265
|
+
return response
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
async function attachFlamegraphToAlerts (scalerUrl, flamegraphId, alertIds) {
|
|
269
|
+
const url = `${scalerUrl}/flamegraphs/${flamegraphId}/alerts`
|
|
270
|
+
app.log.info({ flamegraphId, alerts: alertIds }, 'Attaching flamegraph to alerts')
|
|
271
|
+
|
|
272
|
+
const authHeaders = await app.getAuthorizationHeader()
|
|
273
|
+
const { statusCode, body } = await request(url, {
|
|
274
|
+
method: 'POST',
|
|
275
|
+
headers: {
|
|
276
|
+
'Content-Type': 'application/json',
|
|
277
|
+
...authHeaders
|
|
278
|
+
},
|
|
279
|
+
body: JSON.stringify({ alertIds })
|
|
280
|
+
})
|
|
281
|
+
|
|
282
|
+
if (statusCode !== 200) {
|
|
283
|
+
const error = await body.text()
|
|
284
|
+
if (statusCode === 404 && error.includes('Route POST')) {
|
|
285
|
+
const err = new Error('Attaching flamegraph multiple alerts is not supported by the scaler')
|
|
286
|
+
err.code = 'PLT_ATTACH_FLAMEGRAPH_MULTIPLE_ALERTS_NOT_SUPPORTED'
|
|
287
|
+
throw err
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
throw new Error(`Failed to attach flamegraph to alerts: ${error}`)
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
function cleanupFlamegraphsCache () {
|
|
295
|
+
const now = Date.now()
|
|
296
|
+
|
|
297
|
+
for (const workerId of Object.keys(profilesByWorkerId)) {
|
|
298
|
+
const { timestamp } = profilesByWorkerId[workerId]
|
|
299
|
+
if (now - timestamp > durationMillis) {
|
|
300
|
+
delete profilesByWorkerId[workerId]
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
}
|
|
197
304
|
}
|
|
198
305
|
|
|
199
306
|
export default flamegraphs
|
|
package/plugins/health-signals.js
CHANGED
|
@@ -80,6 +80,7 @@ async function healthSignals (app, _opts) {
|
|
|
80
80
|
}
|
|
81
81
|
|
|
82
82
|
const {
|
|
83
|
+
id: workerId,
|
|
83
84
|
application: serviceId,
|
|
84
85
|
currentHealth,
|
|
85
86
|
healthSignals
|
|
@@ -125,13 +126,13 @@ async function healthSignals (app, _opts) {
|
|
|
125
126
|
}
|
|
126
127
|
|
|
127
128
|
if (healthSignals.length > 0) {
|
|
128
|
-
await sendHealthSignalsWithTimeout(serviceId, healthSignals)
|
|
129
|
+
await sendHealthSignalsWithTimeout(serviceId, workerId, healthSignals)
|
|
129
130
|
}
|
|
130
131
|
})
|
|
131
132
|
}
|
|
132
133
|
app.setupHealthSignals = setupHealthSignals
|
|
133
134
|
|
|
134
|
-
async function sendHealthSignalsWithTimeout (serviceId, signals) {
|
|
135
|
+
async function sendHealthSignalsWithTimeout (serviceId, workerId, signals) {
|
|
135
136
|
signalsCaches[serviceId] ??= new HealthSignalsCache()
|
|
136
137
|
servicesSendingStatuses[serviceId] ??= false
|
|
137
138
|
|
|
@@ -148,7 +149,7 @@ async function healthSignals (app, _opts) {
|
|
|
148
149
|
|
|
149
150
|
try {
|
|
150
151
|
const signals = signalsCache.getAll()
|
|
151
|
-
await sendHealthSignals(serviceId, signals, metrics)
|
|
152
|
+
await sendHealthSignals(serviceId, workerId, signals, metrics)
|
|
152
153
|
} catch (err) {
|
|
153
154
|
app.log.error({ err }, 'Failed to send health signals to scaler')
|
|
154
155
|
}
|
|
@@ -156,7 +157,7 @@ async function healthSignals (app, _opts) {
|
|
|
156
157
|
}
|
|
157
158
|
}
|
|
158
159
|
|
|
159
|
-
async function sendHealthSignals (serviceId, signals, metrics) {
|
|
160
|
+
async function sendHealthSignals (serviceId, workerId, signals, metrics) {
|
|
160
161
|
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
161
162
|
const applicationId = app.instanceConfig?.applicationId
|
|
162
163
|
const authHeaders = await app.getAuthorizationHeader()
|
|
@@ -184,11 +185,13 @@ async function healthSignals (app, _opts) {
|
|
|
184
185
|
|
|
185
186
|
const alert = await body.json()
|
|
186
187
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
188
|
+
app.sendFlamegraphs({
|
|
189
|
+
serviceIds: [serviceId],
|
|
190
|
+
workerIds: [workerId],
|
|
191
|
+
alertId: alert.id
|
|
192
|
+
}).catch(err => {
|
|
190
193
|
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
191
|
-
}
|
|
194
|
+
})
|
|
192
195
|
}
|
|
193
196
|
}
|
|
194
197
|
|
package/plugins/update.js
CHANGED
|
@@ -23,14 +23,14 @@ async function updatePlugin (app) {
|
|
|
23
23
|
// Handle trigger-flamegraph command from ICC
|
|
24
24
|
if (command === 'trigger-flamegraph') {
|
|
25
25
|
app.log.info({ command }, 'Received trigger-flamegraph command from ICC')
|
|
26
|
-
|
|
26
|
+
app.sendFlamegraphs({ profileType: 'cpu' })
|
|
27
27
|
return
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
// Handle trigger-heapprofile command from ICC
|
|
31
31
|
if (command === 'trigger-heapprofile') {
|
|
32
32
|
app.log.info({ command }, 'Received trigger-heapprofile command from ICC')
|
|
33
|
-
|
|
33
|
+
app.sendFlamegraphs({ profileType: 'heap' })
|
|
34
34
|
return
|
|
35
35
|
}
|
|
36
36
|
|
package/test/alerts.test.js
CHANGED
|
@@ -378,10 +378,8 @@ test('should respect alert retention window', async (t) => {
|
|
|
378
378
|
const icc = await startICC(t, {
|
|
379
379
|
applicationId,
|
|
380
380
|
applicationName,
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
alertRetentionWindow: 500
|
|
384
|
-
}
|
|
381
|
+
scaler: {
|
|
382
|
+
alertRetentionWindow: 500
|
|
385
383
|
},
|
|
386
384
|
processAlerts: (req) => {
|
|
387
385
|
const alert = req.body
|
|
@@ -740,3 +738,213 @@ test('should handle old runtime (< 3.18.0) health events', async (t) => {
|
|
|
740
738
|
assert.deepStrictEqual(alertReceived.alert.currentHealth, healthInfo.currentHealth)
|
|
741
739
|
assert.strictEqual(alertReceived.alert.healthConfig, undefined, 'healthConfig should be deleted from alert')
|
|
742
740
|
})
|
|
741
|
+
|
|
742
|
+
test('should attach one flamegraph to multiple alerts', async (t) => {
|
|
743
|
+
const applicationName = 'test-app'
|
|
744
|
+
const applicationId = randomUUID()
|
|
745
|
+
const applicationPath = join(__dirname, 'fixtures', 'service-1')
|
|
746
|
+
|
|
747
|
+
const receivedAlerts = []
|
|
748
|
+
const receivedFlamegraphs = []
|
|
749
|
+
const receivedAttachedFlamegraphs = []
|
|
750
|
+
|
|
751
|
+
const getAuthorizationHeader = async (headers) => {
|
|
752
|
+
return { ...headers, authorization: 'Bearer test-token' }
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
const icc = await startICC(t, {
|
|
756
|
+
applicationId,
|
|
757
|
+
applicationName,
|
|
758
|
+
scaler: {
|
|
759
|
+
podHealthWindow: 1,
|
|
760
|
+
alertRetentionWindow: 1
|
|
761
|
+
},
|
|
762
|
+
processAlerts: (req) => {
|
|
763
|
+
assert.equal(req.headers.authorization, 'Bearer test-token')
|
|
764
|
+
const alert = req.body
|
|
765
|
+
alert.id = `alert-${receivedAlerts.length + 1}`
|
|
766
|
+
receivedAlerts.push(alert)
|
|
767
|
+
return alert
|
|
768
|
+
},
|
|
769
|
+
processFlamegraphs: (req) => {
|
|
770
|
+
assert.strictEqual(req.headers.authorization, 'Bearer test-token')
|
|
771
|
+
const flamegraphId = `flamegraph-${receivedFlamegraphs.length + 1}`
|
|
772
|
+
const alertId = req.query.alertId
|
|
773
|
+
receivedFlamegraphs.push({ id: flamegraphId, alertId })
|
|
774
|
+
return { id: flamegraphId }
|
|
775
|
+
},
|
|
776
|
+
attachFlamegraphToAlerts: (req) => {
|
|
777
|
+
assert.strictEqual(req.headers.authorization, 'Bearer test-token')
|
|
778
|
+
const flamegraphId = req.params.flamegraphId
|
|
779
|
+
const { alertIds } = req.body
|
|
780
|
+
receivedAttachedFlamegraphs.push({ flamegraphId, alertIds })
|
|
781
|
+
return {}
|
|
782
|
+
}
|
|
783
|
+
})
|
|
784
|
+
|
|
785
|
+
setUpEnvironment({
|
|
786
|
+
PLT_APP_NAME: applicationName,
|
|
787
|
+
PLT_APP_DIR: applicationPath,
|
|
788
|
+
PLT_ICC_URL: 'http://127.0.0.1:3000',
|
|
789
|
+
PLT_DISABLE_FLAMEGRAPHS: false,
|
|
790
|
+
PLT_FLAMEGRAPHS_INTERVAL_SEC: 5,
|
|
791
|
+
PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0
|
|
792
|
+
})
|
|
793
|
+
|
|
794
|
+
const app = await start()
|
|
795
|
+
app.getAuthorizationHeader = getAuthorizationHeader
|
|
796
|
+
|
|
797
|
+
t.after(async () => {
|
|
798
|
+
await app.close()
|
|
799
|
+
await icc.close()
|
|
800
|
+
})
|
|
801
|
+
|
|
802
|
+
// Wait for the first flamegraph to be generated
|
|
803
|
+
await sleep(5000)
|
|
804
|
+
|
|
805
|
+
// Manually trigger health event with unhealthy state
|
|
806
|
+
const healthInfo = {
|
|
807
|
+
id: 'main:0',
|
|
808
|
+
application: 'main',
|
|
809
|
+
currentHealth: {
|
|
810
|
+
elu: 0.995,
|
|
811
|
+
heapUsed: 76798040,
|
|
812
|
+
heapTotal: 99721216
|
|
813
|
+
},
|
|
814
|
+
unhealthy: true,
|
|
815
|
+
healthConfig: {
|
|
816
|
+
enabled: true,
|
|
817
|
+
interval: 1000,
|
|
818
|
+
gracePeriod: 1000,
|
|
819
|
+
maxUnhealthyChecks: 10,
|
|
820
|
+
maxELU: 0.99,
|
|
821
|
+
maxHeapUsed: 0.99,
|
|
822
|
+
maxHeapTotal: 4294967296
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
|
|
826
|
+
emitHealthEvent(app, healthInfo)
|
|
827
|
+
await sleep(1000)
|
|
828
|
+
emitHealthEvent(app, healthInfo)
|
|
829
|
+
|
|
830
|
+
// Wait for flamegraphs to be sent
|
|
831
|
+
await sleep(1000)
|
|
832
|
+
|
|
833
|
+
assert.strictEqual(receivedAlerts.length, 2)
|
|
834
|
+
const alert1 = receivedAlerts[0]
|
|
835
|
+
const alert2 = receivedAlerts[1]
|
|
836
|
+
assert.strictEqual(alert1.id, 'alert-1')
|
|
837
|
+
assert.strictEqual(alert2.id, 'alert-2')
|
|
838
|
+
|
|
839
|
+
assert.strictEqual(receivedFlamegraphs.length, 1)
|
|
840
|
+
const flamegraph = receivedFlamegraphs[0]
|
|
841
|
+
assert.strictEqual(flamegraph.id, 'flamegraph-1')
|
|
842
|
+
assert.strictEqual(flamegraph.alertId, 'alert-1')
|
|
843
|
+
|
|
844
|
+
assert.strictEqual(receivedAttachedFlamegraphs.length, 1)
|
|
845
|
+
const attachedFlamegraph = receivedAttachedFlamegraphs[0]
|
|
846
|
+
assert.strictEqual(attachedFlamegraph.flamegraphId, 'flamegraph-1')
|
|
847
|
+
assert.deepStrictEqual(attachedFlamegraph.alertIds, ['alert-2'])
|
|
848
|
+
})
|
|
849
|
+
|
|
850
|
+
test('should send flamegraphs if attaching fails', async (t) => {
|
|
851
|
+
const applicationName = 'test-app'
|
|
852
|
+
const applicationId = randomUUID()
|
|
853
|
+
const applicationPath = join(__dirname, 'fixtures', 'service-1')
|
|
854
|
+
|
|
855
|
+
const receivedAlerts = []
|
|
856
|
+
const receivedFlamegraphs = []
|
|
857
|
+
|
|
858
|
+
const getAuthorizationHeader = async (headers) => {
|
|
859
|
+
return { ...headers, authorization: 'Bearer test-token' }
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
const icc = await startICC(t, {
|
|
863
|
+
applicationId,
|
|
864
|
+
applicationName,
|
|
865
|
+
scaler: {
|
|
866
|
+
podHealthWindow: 1,
|
|
867
|
+
alertRetentionWindow: 1
|
|
868
|
+
},
|
|
869
|
+
processAlerts: (req) => {
|
|
870
|
+
assert.equal(req.headers.authorization, 'Bearer test-token')
|
|
871
|
+
const alert = req.body
|
|
872
|
+
alert.id = `alert-${receivedAlerts.length + 1}`
|
|
873
|
+
receivedAlerts.push(alert)
|
|
874
|
+
return alert
|
|
875
|
+
},
|
|
876
|
+
processFlamegraphs: (req) => {
|
|
877
|
+
assert.strictEqual(req.headers.authorization, 'Bearer test-token')
|
|
878
|
+
const flamegraphId = `flamegraph-${receivedFlamegraphs.length + 1}`
|
|
879
|
+
const alertId = req.query.alertId
|
|
880
|
+
receivedFlamegraphs.push({ id: flamegraphId, alertId })
|
|
881
|
+
return { id: flamegraphId }
|
|
882
|
+
},
|
|
883
|
+
attachFlamegraphToAlerts: (req) => {
|
|
884
|
+
throw new Error('Failed to attach flamegraph')
|
|
885
|
+
}
|
|
886
|
+
})
|
|
887
|
+
|
|
888
|
+
setUpEnvironment({
|
|
889
|
+
PLT_APP_NAME: applicationName,
|
|
890
|
+
PLT_APP_DIR: applicationPath,
|
|
891
|
+
PLT_ICC_URL: 'http://127.0.0.1:3000',
|
|
892
|
+
PLT_DISABLE_FLAMEGRAPHS: false,
|
|
893
|
+
PLT_FLAMEGRAPHS_INTERVAL_SEC: 5,
|
|
894
|
+
PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0
|
|
895
|
+
})
|
|
896
|
+
|
|
897
|
+
const app = await start()
|
|
898
|
+
app.getAuthorizationHeader = getAuthorizationHeader
|
|
899
|
+
|
|
900
|
+
t.after(async () => {
|
|
901
|
+
await app.close()
|
|
902
|
+
await icc.close()
|
|
903
|
+
})
|
|
904
|
+
|
|
905
|
+
// Wait for the first flamegraph to be generated
|
|
906
|
+
await sleep(5000)
|
|
907
|
+
|
|
908
|
+
// Manually trigger health event with unhealthy state
|
|
909
|
+
const healthInfo = {
|
|
910
|
+
id: 'main:0',
|
|
911
|
+
application: 'main',
|
|
912
|
+
currentHealth: {
|
|
913
|
+
elu: 0.995,
|
|
914
|
+
heapUsed: 76798040,
|
|
915
|
+
heapTotal: 99721216
|
|
916
|
+
},
|
|
917
|
+
unhealthy: true,
|
|
918
|
+
healthConfig: {
|
|
919
|
+
enabled: true,
|
|
920
|
+
interval: 1000,
|
|
921
|
+
gracePeriod: 1000,
|
|
922
|
+
maxUnhealthyChecks: 10,
|
|
923
|
+
maxELU: 0.99,
|
|
924
|
+
maxHeapUsed: 0.99,
|
|
925
|
+
maxHeapTotal: 4294967296
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
emitHealthEvent(app, healthInfo)
|
|
930
|
+
await sleep(1000)
|
|
931
|
+
emitHealthEvent(app, healthInfo)
|
|
932
|
+
|
|
933
|
+
// Wait for flamegraphs to be sent
|
|
934
|
+
await sleep(1000)
|
|
935
|
+
|
|
936
|
+
assert.strictEqual(receivedAlerts.length, 2)
|
|
937
|
+
const alert1 = receivedAlerts[0]
|
|
938
|
+
const alert2 = receivedAlerts[1]
|
|
939
|
+
assert.strictEqual(alert1.id, 'alert-1')
|
|
940
|
+
assert.strictEqual(alert2.id, 'alert-2')
|
|
941
|
+
|
|
942
|
+
assert.strictEqual(receivedFlamegraphs.length, 2)
|
|
943
|
+
const flamegraph1 = receivedFlamegraphs[0]
|
|
944
|
+
assert.strictEqual(flamegraph1.id, 'flamegraph-1')
|
|
945
|
+
assert.strictEqual(flamegraph1.alertId, 'alert-1')
|
|
946
|
+
|
|
947
|
+
const flamegraph2 = receivedFlamegraphs[1]
|
|
948
|
+
assert.strictEqual(flamegraph2.id, 'flamegraph-2')
|
|
949
|
+
assert.strictEqual(flamegraph2.alertId, 'alert-2')
|
|
950
|
+
})
|
package/test/helper.js
CHANGED
|
@@ -199,6 +199,9 @@ async function startICC (t, opts = {}) {
|
|
|
199
199
|
icc.post('/pods/:podId/services/:serviceId/flamegraph', async (req) => {
|
|
200
200
|
return opts.processFlamegraphs?.(req)
|
|
201
201
|
})
|
|
202
|
+
icc.post('/flamegraphs/:flamegraphId/alerts', async (req) => {
|
|
203
|
+
return opts.attachFlamegraphToAlerts?.(req)
|
|
204
|
+
})
|
|
202
205
|
}, { prefix: '/scaler' })
|
|
203
206
|
|
|
204
207
|
// Cron
|
|
package/test/trigger-flamegraphs.test.js
CHANGED
|
@@ -35,7 +35,7 @@ function setupMockIccServer (wss, receivedMessages, validateAuth = false) {
|
|
|
35
35
|
return { waitForClientSubscription, getWs: () => ws }
|
|
36
36
|
}
|
|
37
37
|
|
|
38
|
-
function createMockApp (port, includeScalerUrl = true) {
|
|
38
|
+
function createMockApp (port, includeScalerUrl = true, env = {}) {
|
|
39
39
|
const eventListeners = new Map()
|
|
40
40
|
|
|
41
41
|
const mockWatt = {
|
|
@@ -100,7 +100,9 @@ function createMockApp (port, includeScalerUrl = true) {
|
|
|
100
100
|
PLT_DISABLE_FLAMEGRAPHS: false,
|
|
101
101
|
PLT_FLAMEGRAPHS_INTERVAL_SEC: 1,
|
|
102
102
|
PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0,
|
|
103
|
-
PLT_FLAMEGRAPHS_GRACE_PERIOD: 0
|
|
103
|
+
PLT_FLAMEGRAPHS_GRACE_PERIOD: 0,
|
|
104
|
+
PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: 1000,
|
|
105
|
+
...env
|
|
104
106
|
},
|
|
105
107
|
watt: mockWatt
|
|
106
108
|
}
|
|
@@ -333,15 +335,15 @@ test('sendFlamegraphs should handle missing profile data', async (t) => {
|
|
|
333
335
|
equal(errors.length, 2, 'Should log errors for both services with missing profiles')
|
|
334
336
|
})
|
|
335
337
|
|
|
336
|
-
test('sendFlamegraphs should filter by serviceIds when provided', async (t) => {
|
|
338
|
+
test('sendFlamegraphs should filter by workerIds when provided', async (t) => {
|
|
337
339
|
setUpEnvironment()
|
|
338
340
|
|
|
339
341
|
const app = createMockApp(port + 12)
|
|
340
342
|
const getProfileCalls = []
|
|
341
343
|
|
|
342
|
-
app.watt.runtime.sendCommandToApplication = async (
|
|
344
|
+
app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
|
|
343
345
|
if (command === 'getLastProfile') {
|
|
344
|
-
getProfileCalls.push(
|
|
346
|
+
getProfileCalls.push(workerId)
|
|
345
347
|
return new Uint8Array([1, 2, 3])
|
|
346
348
|
}
|
|
347
349
|
return { success: false }
|
|
@@ -362,10 +364,49 @@ test('sendFlamegraphs should filter by serviceIds when provided', async (t) => {
|
|
|
362
364
|
t.after(() => server.close())
|
|
363
365
|
|
|
364
366
|
await flamegraphsPlugin(app)
|
|
365
|
-
await app.sendFlamegraphs({
|
|
367
|
+
await app.sendFlamegraphs({ workerIds: ['service-1:0'] })
|
|
366
368
|
|
|
367
369
|
equal(getProfileCalls.length, 1, 'Should only request profile for specified service')
|
|
368
|
-
equal(getProfileCalls[0], 'service-1', 'Should request profile for service-1')
|
|
370
|
+
equal(getProfileCalls[0], 'service-1:0', 'Should request profile for service-1')
|
|
371
|
+
})
|
|
372
|
+
|
|
373
|
+
test('sendFlamegraphs should try to get the profile from a service if worker is not available', async (t) => {
|
|
374
|
+
setUpEnvironment()
|
|
375
|
+
|
|
376
|
+
const app = createMockApp(port + 12)
|
|
377
|
+
const getProfileCalls = []
|
|
378
|
+
|
|
379
|
+
app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
|
|
380
|
+
if (command === 'getLastProfile') {
|
|
381
|
+
getProfileCalls.push(workerId)
|
|
382
|
+
if (workerId === 'service-1:2') {
|
|
383
|
+
throw new Error('Worker not available')
|
|
384
|
+
}
|
|
385
|
+
return new Uint8Array([1, 2, 3])
|
|
386
|
+
}
|
|
387
|
+
return { success: false }
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
// Mock HTTP server
|
|
391
|
+
const { createServer } = await import('node:http')
|
|
392
|
+
const server = createServer((req, res) => {
|
|
393
|
+
const body = []
|
|
394
|
+
req.on('data', chunk => body.push(chunk))
|
|
395
|
+
req.on('end', () => {
|
|
396
|
+
res.writeHead(200)
|
|
397
|
+
res.end()
|
|
398
|
+
})
|
|
399
|
+
})
|
|
400
|
+
|
|
401
|
+
await new Promise(resolve => server.listen(port + 12, resolve))
|
|
402
|
+
t.after(() => server.close())
|
|
403
|
+
|
|
404
|
+
await flamegraphsPlugin(app)
|
|
405
|
+
await app.sendFlamegraphs({ workerIds: ['service-1:2'] })
|
|
406
|
+
|
|
407
|
+
equal(getProfileCalls.length, 2)
|
|
408
|
+
equal(getProfileCalls[0], 'service-1:2')
|
|
409
|
+
equal(getProfileCalls[1], 'service-1')
|
|
369
410
|
})
|
|
370
411
|
|
|
371
412
|
test('sendFlamegraphs should skip when PLT_DISABLE_FLAMEGRAPHS is set', async (t) => {
|
|
@@ -376,9 +417,9 @@ test('sendFlamegraphs should skip when PLT_DISABLE_FLAMEGRAPHS is set', async (t
|
|
|
376
417
|
|
|
377
418
|
const getProfileCalls = []
|
|
378
419
|
|
|
379
|
-
app.watt.runtime.sendCommandToApplication = async (
|
|
420
|
+
app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
|
|
380
421
|
if (command === 'getLastProfile') {
|
|
381
|
-
getProfileCalls.push(
|
|
422
|
+
getProfileCalls.push(workerId)
|
|
382
423
|
return new Uint8Array([1, 2, 3])
|
|
383
424
|
}
|
|
384
425
|
return { success: false }
|
|
@@ -452,6 +493,13 @@ test('should handle trigger-flamegraph command and upload flamegraphs from servi
|
|
|
452
493
|
await app.connectToUpdates()
|
|
453
494
|
await app.setupFlamegraphs()
|
|
454
495
|
|
|
496
|
+
t.after(async () => {
|
|
497
|
+
if (app.cleanupFlamegraphs) {
|
|
498
|
+
app.cleanupFlamegraphs()
|
|
499
|
+
}
|
|
500
|
+
await app.closeUpdates()
|
|
501
|
+
})
|
|
502
|
+
|
|
455
503
|
await waitForClientSubscription
|
|
456
504
|
|
|
457
505
|
const triggerFlamegraphMessage = {
|
|
@@ -473,9 +521,6 @@ test('should handle trigger-flamegraph command and upload flamegraphs from servi
|
|
|
473
521
|
|
|
474
522
|
equal(service1Req.serviceId, 'service-1')
|
|
475
523
|
equal(service2Req.serviceId, 'service-2')
|
|
476
|
-
|
|
477
|
-
if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
|
|
478
|
-
await app.closeUpdates()
|
|
479
524
|
})
|
|
480
525
|
|
|
481
526
|
test('should handle trigger-flamegraph when no runtime is available', async (t) => {
|
|
@@ -609,6 +654,13 @@ test('should handle trigger-heapprofile command and upload heap profiles from se
|
|
|
609
654
|
await app.connectToUpdates()
|
|
610
655
|
await app.setupFlamegraphs()
|
|
611
656
|
|
|
657
|
+
t.after(async () => {
|
|
658
|
+
if (app.cleanupFlamegraphs) {
|
|
659
|
+
app.cleanupFlamegraphs()
|
|
660
|
+
}
|
|
661
|
+
await app.closeUpdates()
|
|
662
|
+
})
|
|
663
|
+
|
|
612
664
|
await waitForClientSubscription
|
|
613
665
|
|
|
614
666
|
const triggerHeapProfileMessage = {
|
|
@@ -630,9 +682,6 @@ test('should handle trigger-heapprofile command and upload heap profiles from se
|
|
|
630
682
|
|
|
631
683
|
equal(service1Req.serviceId, 'service-1')
|
|
632
684
|
equal(service2Req.serviceId, 'service-2')
|
|
633
|
-
|
|
634
|
-
if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
|
|
635
|
-
await app.closeUpdates()
|
|
636
685
|
})
|
|
637
686
|
|
|
638
687
|
test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (t) => {
|
|
@@ -640,11 +689,6 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
|
|
|
640
689
|
|
|
641
690
|
const receivedMessages = []
|
|
642
691
|
const infoLogs = []
|
|
643
|
-
let errorCount = 0
|
|
644
|
-
let uploadResolve
|
|
645
|
-
const allUploadsComplete = new Promise((resolve) => {
|
|
646
|
-
uploadResolve = resolve
|
|
647
|
-
})
|
|
648
692
|
|
|
649
693
|
const wss = new WebSocketServer({ port: port + 4 })
|
|
650
694
|
t.after(async () => wss.close())
|
|
@@ -655,19 +699,21 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
|
|
|
655
699
|
true
|
|
656
700
|
)
|
|
657
701
|
|
|
658
|
-
const app = createMockApp(port + 4
|
|
702
|
+
const app = createMockApp(port + 4, true, {
|
|
703
|
+
PLT_FLAMEGRAPHS_INTERVAL_SEC: 10,
|
|
704
|
+
PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: 1000
|
|
705
|
+
})
|
|
706
|
+
|
|
659
707
|
const originalInfo = app.log.info
|
|
660
708
|
app.log.info = (...args) => {
|
|
661
709
|
originalInfo(...args)
|
|
662
|
-
|
|
663
|
-
infoLogs.push(args)
|
|
664
|
-
errorCount++
|
|
665
|
-
if (errorCount === 2) {
|
|
666
|
-
uploadResolve()
|
|
667
|
-
}
|
|
668
|
-
}
|
|
710
|
+
infoLogs.push(args)
|
|
669
711
|
}
|
|
670
712
|
|
|
713
|
+
// Profile will be generated in 10s
|
|
714
|
+
const profileGenerationDate = Date.now() + 10000
|
|
715
|
+
const mockProfile = new Uint8Array([1, 2, 3, 4, 5])
|
|
716
|
+
|
|
671
717
|
app.watt.runtime.sendCommandToApplication = async (
|
|
672
718
|
serviceId,
|
|
673
719
|
command
|
|
@@ -676,9 +722,13 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
|
|
|
676
722
|
return { success: true }
|
|
677
723
|
}
|
|
678
724
|
if (command === 'getLastProfile') {
|
|
679
|
-
const
|
|
680
|
-
|
|
681
|
-
|
|
725
|
+
const now = Date.now()
|
|
726
|
+
if (now < profileGenerationDate) {
|
|
727
|
+
const error = new Error('No profile available - wait for profiling to complete or trigger manual capture')
|
|
728
|
+
error.code = 'PLT_PPROF_NO_PROFILE_AVAILABLE'
|
|
729
|
+
throw error
|
|
730
|
+
}
|
|
731
|
+
return mockProfile
|
|
682
732
|
}
|
|
683
733
|
return { success: false }
|
|
684
734
|
}
|
|
@@ -689,6 +739,13 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
|
|
|
689
739
|
await app.connectToUpdates()
|
|
690
740
|
await app.setupFlamegraphs()
|
|
691
741
|
|
|
742
|
+
t.after(async () => {
|
|
743
|
+
if (app.cleanupFlamegraphs) {
|
|
744
|
+
app.cleanupFlamegraphs()
|
|
745
|
+
}
|
|
746
|
+
await app.closeUpdates()
|
|
747
|
+
})
|
|
748
|
+
|
|
692
749
|
await waitForClientSubscription
|
|
693
750
|
|
|
694
751
|
const triggerFlamegraphMessage = {
|
|
@@ -697,15 +754,47 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
|
|
|
697
754
|
|
|
698
755
|
getWs().send(JSON.stringify(triggerFlamegraphMessage))
|
|
699
756
|
|
|
700
|
-
await
|
|
757
|
+
await sleep(15000)
|
|
701
758
|
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
759
|
+
const service1AttemptLogs = []
|
|
760
|
+
const service2AttemptLogs = []
|
|
761
|
+
const service1SuccessLogs = []
|
|
762
|
+
const service2SuccessLogs = []
|
|
706
763
|
|
|
707
|
-
|
|
708
|
-
|
|
764
|
+
for (const infoLog of infoLogs) {
|
|
765
|
+
if (infoLog.length !== 2) continue
|
|
766
|
+
const [options, message] = infoLog
|
|
767
|
+
|
|
768
|
+
if (message.includes('No profile available for the service')) {
|
|
769
|
+
const { workerId, attempt, maxAttempts, attemptTimeout } = options
|
|
770
|
+
|
|
771
|
+
equal(maxAttempts, 11)
|
|
772
|
+
equal(attemptTimeout, 1000)
|
|
773
|
+
|
|
774
|
+
if (workerId === 'service-1') {
|
|
775
|
+
service1AttemptLogs.push(infoLog)
|
|
776
|
+
equal(attempt, service1AttemptLogs.length)
|
|
777
|
+
}
|
|
778
|
+
if (workerId === 'service-2') {
|
|
779
|
+
service2AttemptLogs.push(infoLog)
|
|
780
|
+
equal(attempt, service2AttemptLogs.length)
|
|
781
|
+
}
|
|
782
|
+
continue
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
if (message.includes('Sending flamegraph')) {
|
|
786
|
+
if (options.serviceId === 'service-1') {
|
|
787
|
+
service1SuccessLogs.push(infoLog)
|
|
788
|
+
} else if (options.serviceId === 'service-2') {
|
|
789
|
+
service2SuccessLogs.push(infoLog)
|
|
790
|
+
}
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
|
|
794
|
+
equal(service1AttemptLogs.length, 10)
|
|
795
|
+
equal(service2AttemptLogs.length, 10)
|
|
796
|
+
equal(service1SuccessLogs.length, 1)
|
|
797
|
+
equal(service2SuccessLogs.length, 1)
|
|
709
798
|
})
|
|
710
799
|
|
|
711
800
|
test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) => {
|
|
@@ -762,6 +851,13 @@ test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) =>
|
|
|
762
851
|
await app.connectToUpdates()
|
|
763
852
|
await app.setupFlamegraphs()
|
|
764
853
|
|
|
854
|
+
t.after(async () => {
|
|
855
|
+
if (app.cleanupFlamegraphs) {
|
|
856
|
+
app.cleanupFlamegraphs()
|
|
857
|
+
}
|
|
858
|
+
await app.closeUpdates()
|
|
859
|
+
})
|
|
860
|
+
|
|
765
861
|
await waitForClientSubscription
|
|
766
862
|
|
|
767
863
|
const triggerFlamegraphMessage = {
|
|
@@ -773,12 +869,8 @@ test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) =>
|
|
|
773
869
|
await allUploadsComplete
|
|
774
870
|
|
|
775
871
|
equal(infoLogs.length, 2)
|
|
776
|
-
equal(infoLogs[0][0].
|
|
777
|
-
equal(infoLogs[0][0].podId, 'test-pod-123')
|
|
872
|
+
equal(infoLogs[0][0].workerId, 'service-1')
|
|
778
873
|
equal(infoLogs[0][1], 'ELU low, CPU profiling not active')
|
|
779
|
-
|
|
780
|
-
if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
|
|
781
|
-
await app.closeUpdates()
|
|
782
874
|
})
|
|
783
875
|
|
|
784
876
|
test('should start profiling on new workers that start after initial setup', async (t) => {
|
|
@@ -815,6 +907,13 @@ test('should start profiling on new workers that start after initial setup', asy
|
|
|
815
907
|
await app.connectToUpdates()
|
|
816
908
|
await app.setupFlamegraphs()
|
|
817
909
|
|
|
910
|
+
t.after(async () => {
|
|
911
|
+
if (app.cleanupFlamegraphs) {
|
|
912
|
+
app.cleanupFlamegraphs()
|
|
913
|
+
}
|
|
914
|
+
await app.closeUpdates()
|
|
915
|
+
})
|
|
916
|
+
|
|
818
917
|
await waitForClientSubscription
|
|
819
918
|
|
|
820
919
|
equal(startProfilingCalls.length, 4)
|
|
@@ -844,9 +943,6 @@ test('should start profiling on new workers that start after initial setup', asy
|
|
|
844
943
|
equal(startProfilingCalls[5].options.durationMillis, 1000)
|
|
845
944
|
equal(startProfilingCalls[5].options.eluThreshold, 0)
|
|
846
945
|
equal(startProfilingCalls[5].options.type, 'heap')
|
|
847
|
-
|
|
848
|
-
if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
|
|
849
|
-
await app.closeUpdates()
|
|
850
946
|
})
|
|
851
947
|
|
|
852
948
|
test('should not start profiling on new workers when flamegraphs are disabled', async (t) => {
|
|
@@ -884,6 +980,13 @@ test('should not start profiling on new workers when flamegraphs are disabled',
|
|
|
884
980
|
await app.connectToUpdates()
|
|
885
981
|
await app.setupFlamegraphs()
|
|
886
982
|
|
|
983
|
+
t.after(async () => {
|
|
984
|
+
if (app.cleanupFlamegraphs) {
|
|
985
|
+
app.cleanupFlamegraphs()
|
|
986
|
+
}
|
|
987
|
+
await app.closeUpdates()
|
|
988
|
+
})
|
|
989
|
+
|
|
887
990
|
await waitForClientSubscription
|
|
888
991
|
|
|
889
992
|
equal(startProfilingCalls.length, 0)
|
|
@@ -897,9 +1000,6 @@ test('should not start profiling on new workers when flamegraphs are disabled',
|
|
|
897
1000
|
await sleep(10)
|
|
898
1001
|
|
|
899
1002
|
equal(startProfilingCalls.length, 0)
|
|
900
|
-
|
|
901
|
-
if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
|
|
902
|
-
await app.closeUpdates()
|
|
903
1003
|
})
|
|
904
1004
|
|
|
905
1005
|
test('sendFlamegraphs should include alertId in query params when provided', async (t) => {
|