@platformatic/watt-extra 1.7.0 → 1.7.1-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +5 -8
- package/package.json +1 -1
- package/plugins/alerts.js +11 -6
- package/plugins/env.js +2 -3
- package/plugins/flamegraphs.js +330 -205
- package/plugins/health-signals.js +9 -8
- package/plugins/update.js +2 -2
- package/test/alerts.test.js +106 -26
- package/test/fixtures/service-1/routes/root.cjs +13 -1
- package/test/health-signals.test.js +166 -10
- package/test/profiler.test.js +443 -0
- package/test/trigger-flamegraphs.test.js +257 -416
package/plugins/flamegraphs.js
CHANGED
|
@@ -1,146 +1,234 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
-
import { setTimeout as sleep } from 'node:timers/promises'
|
|
4
3
|
import { request } from 'undici'
|
|
5
4
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
/**
 * On-demand profiler for a single runtime worker.
 *
 * Callers queue requests with `requestProfile()`. The first request sends
 * `startProfiling` to the worker and arms an interval that, every `duration`
 * milliseconds, fetches the worker's last profile and hands it to
 * `onProfile(err, profile, requests)` together with the requests it covers.
 * When a tick leaves the queue empty, a stop is scheduled after `duration / 2`;
 * a new request cancels that pending stop.
 */
export class Profiler {
  #workerId
  #type
  #duration
  #profileOptions
  #runtime
  #log
  #requests
  #isProfiling
  #onProfile
  #getProfileInterval
  #stopProfileTimeout

  /**
   * @param {object} options
   * @param {'cpu'|'heap'} options.type - Kind of profile to capture.
   * @param {number} options.duration - Profiling window in milliseconds; also the polling period.
   * @param {string} options.workerId - Worker id containing the service id and worker index, separated by ':'.
   * @param {boolean} [options.sourceMaps=false] - Forwarded to the worker's profiling commands.
   * @param {object} options.app - Must expose `watt.runtime` and `log`.
   * @param {Function} options.onProfile - Called as `(err, profile, requests)`.
   * @throws {Error} When any of the options above is invalid.
   */
  constructor (options = {}) {
    const { type, duration, workerId, sourceMaps, app, onProfile } = options

    if (type !== 'cpu' && type !== 'heap') {
      throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
    }
    // NaN and Infinity are typeof 'number', but timers coerce them to a 1ms
    // delay, which would turn the polling interval into a busy loop.
    if (typeof duration !== 'number' || !Number.isFinite(duration)) {
      throw new Error('Invalid Profiler duration. Must be a number')
    }
    if (typeof workerId !== 'string') {
      throw new Error('Invalid Worker ID. Must be a string')
    }
    if (!workerId.includes(':')) {
      throw new Error('Worker ID must include the service ID and worker index')
    }
    if (typeof onProfile !== 'function') {
      throw new Error('Invalid onProfile handler. Must be a function')
    }

    this.#type = type
    this.#duration = duration
    this.#workerId = workerId
    this.#onProfile = onProfile

    this.#profileOptions = {
      type,
      durationMillis: duration,
      sourceMaps: sourceMaps ?? false
    }

    this.#requests = []
    this.#isProfiling = false

    this.#runtime = app.watt.runtime
    this.#log = app.log.child({
      workerId: this.#workerId,
      profilerType: this.#type
    })
  }

  /** @returns {string} Id of the worker this profiler targets. */
  get workerId () {
    return this.#workerId
  }

  /** @returns {boolean} True between a start command being issued and a stop (or a failed start). */
  get isProfiling () {
    return this.#isProfiling
  }

  /**
   * Queues a profile request and starts profiling when it is not running yet.
   * The request object is stamped in place with `timestamp` when it has none.
   * The returned promise does not wait for the worker to start profiling.
   *
   * @param {object} [request] - Opaque payload handed back to `onProfile`.
   */
  async requestProfile (request = {}) {
    request.timestamp ??= Date.now()
    this.#requests.push(request)
    this.#unscheduleStopProfiling()

    if (!this.#isProfiling) {
      // Not awaited on purpose: the loop reports its own failures via onProfile
      this.#startProfilingLoop()
    }
  }

  /**
   * Cancels the polling interval and any scheduled stop, then stops profiling
   * on the worker when it is running. Pending requests are flushed to
   * `onProfile`, with the final profile on success or with the error on failure.
   */
  async stop () {
    if (this.#getProfileInterval) {
      clearInterval(this.#getProfileInterval)
      this.#getProfileInterval = null
    }
    this.#unscheduleStopProfiling()

    if (!this.#isProfiling) return

    const requests = this.#getProfileRequests()
    try {
      // NOTE(review): interval ticks deliver `{ data, timestamp }`, while this
      // path forwards the raw `stopProfiling` result - confirm that handlers
      // cope with both shapes.
      const profile = await this.#stopProfiling()
      if (requests.length > 0) {
        this.#onProfile(null, profile, requests)
      }
    } catch (err) {
      // Kept at debug level: stop failures are expected while the app is closing
      this.#log.debug({ err }, 'Failed to stop profiling')
      if (requests.length > 0) {
        this.#onProfile(err, null, requests)
      }
    }
  }

  // Starts profiling on the worker and arms the periodic profile collection
  async #startProfilingLoop () {
    try {
      await this.#startProfiling()
    } catch (err) {
      // A failed start must not leave the profiler flagged as running,
      // otherwise later requests would never retry and would queue forever.
      this.#isProfiling = false
      this.#log.error({ err }, 'Failed to start profiling')
      const requests = this.#getProfileRequests()
      this.#onProfile(err, null, requests)
      return
    }

    // stop() may have run, or another loop may have armed the interval,
    // while the start command was in flight
    if (!this.#isProfiling || this.#getProfileInterval) return

    this.#getProfileInterval = setInterval(
      () => this.#processProfile(),
      this.#duration
    ).unref()
  }

  // One polling tick: fetch the last profile and deliver it with the requests it covers
  async #processProfile () {
    try {
      const profile = await this.#getProfile()
      const requests = this.#getProfileRequests(profile.timestamp)
      this.#onProfile(null, profile, requests)
    } catch (err) {
      this.#log.error({ err }, 'Failed to generate a profile')
      const requests = this.#getProfileRequests()
      this.#onProfile(err, null, requests)
    }

    // Skip scheduling when stop() already ran while the profile was being fetched
    if (this.#isProfiling && this.#requests.length === 0) {
      this.#scheduleStopProfiling()
    }
  }

  #scheduleStopProfiling () {
    // Never overwrite a live timer handle
    this.#unscheduleStopProfiling()

    // Stop profiling after the duration/2 if there are no more requests
    this.#stopProfileTimeout = setTimeout(
      () => this.stop(),
      this.#duration / 2
    ).unref()
  }

  #unscheduleStopProfiling () {
    if (this.#stopProfileTimeout) {
      clearTimeout(this.#stopProfileTimeout)
      this.#stopProfileTimeout = null
    }
  }

  async #startProfiling () {
    // Flagged before the await so concurrent requests do not start a second loop
    this.#isProfiling = true
    this.#log.info('Starting profiling')

    await this.#runtime.sendCommandToApplication(
      this.#workerId, 'startProfiling', this.#profileOptions
    )
  }

  // Sends `stopProfiling` to the worker; rejections propagate to stop()
  async #stopProfiling () {
    this.#isProfiling = false
    this.#log.info('Stopping profiling')

    return this.#runtime.sendCommandToApplication(
      this.#workerId, 'stopProfiling', this.#profileOptions
    )
  }

  // Fetches the worker's profiling state and last profile in parallel
  async #getProfile () {
    this.#log.info('Getting profile from worker')

    const [state, profile] = await Promise.all([
      this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
      this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
    ])
    return { data: profile, timestamp: state.latestProfileTimestamp }
  }

  // Removes and returns queued requests: all of them when no timestamp is
  // given, otherwise everything up to the last request whose timestamp is not
  // newer than the profile.
  #getProfileRequests (profileTimestamp) {
    if (profileTimestamp === undefined) {
      const requests = this.#requests
      this.#requests = []
      return requests
    }

    let processedIndex = 0
    for (let i = 0; i < this.#requests.length; i++) {
      if (this.#requests[i].timestamp <= profileTimestamp) {
        processedIndex = i + 1
      }
    }
    return this.#requests.splice(0, processedIndex)
  }
}
|
|
134
199
|
|
|
135
|
-
|
|
200
|
+
async function flamegraphs (app, _opts) {
|
|
201
|
+
const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
|
|
202
|
+
const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
|
|
203
|
+
|
|
204
|
+
const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
|
|
136
205
|
|
|
137
|
-
|
|
206
|
+
const profilers = {}
|
|
207
|
+
const profilersConfigs = {}
|
|
208
|
+
const profilersPauseReqs = {}
|
|
209
|
+
|
|
210
|
+
// Records, per runtime application, the profiling settings (window length and
// source-map flag) later used to build that application's profilers.
// Does nothing when flamegraphs are disabled.
app.setupFlamegraphs = async () => {
  if (isFlamegraphsDisabled) {
    app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
    return
  }

  const { runtime } = app.watt
  const { applications } = await runtime.getApplications()

  // Details are fetched one application at a time, in listing order
  for (const { id } of applications) {
    const details = await runtime.getApplicationDetails(id)
    profilersConfigs[id] = {
      durationMillis,
      sourceMaps: details.sourceMaps ?? false
    }
  }
}
|
|
225
|
+
|
|
226
|
+
app.requestFlamegraphs = async (options = {}) => {
|
|
227
|
+
process._rawDebug('--------REQUEST PROFILING-------', options)
|
|
228
|
+
if (isFlamegraphsDisabled) {
|
|
229
|
+
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
|
|
230
|
+
return
|
|
231
|
+
}
|
|
144
232
|
|
|
145
233
|
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
146
234
|
if (!scalerUrl) {
|
|
@@ -150,118 +238,147 @@ async function flamegraphs (app, _opts) {
|
|
|
150
238
|
|
|
151
239
|
const runtime = app.watt.runtime
|
|
152
240
|
|
|
153
|
-
|
|
154
|
-
const { applications } = await runtime.getApplications()
|
|
155
|
-
workerIds = applications.map(app => app.id)
|
|
156
|
-
}
|
|
241
|
+
let { serviceIds, alertId, profileType = 'cpu' } = options
|
|
157
242
|
|
|
158
|
-
|
|
243
|
+
const servicesWorkers = {}
|
|
244
|
+
const workers = await runtime.getWorkers()
|
|
159
245
|
|
|
160
|
-
const
|
|
161
|
-
const
|
|
162
|
-
const
|
|
246
|
+
for (const workerId in workers) {
|
|
247
|
+
const workerInfo = workers[workerId]
|
|
248
|
+
const serviceId = workerInfo.application
|
|
163
249
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
app.log.info(
|
|
168
|
-
{ workerId, alertId }, 'Flamegraph will be attached to the alert'
|
|
169
|
-
)
|
|
170
|
-
profile.waitingAlerts.push(alertId)
|
|
171
|
-
}
|
|
250
|
+
servicesWorkers[serviceId] ??= []
|
|
251
|
+
servicesWorkers[serviceId].push(workerId)
|
|
252
|
+
}
|
|
172
253
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
254
|
+
for (const serviceId in profilers) {
|
|
255
|
+
const workerProfilers = profilers[serviceId]
|
|
256
|
+
for (const profileType in workerProfilers) {
|
|
257
|
+
const profiler = workerProfilers[profileType]
|
|
258
|
+
const workerId = profiler.workerId
|
|
259
|
+
if (workers[workerId]) continue
|
|
260
|
+
if (profiler.isProfiling) {
|
|
261
|
+
profiler.stop()
|
|
176
262
|
}
|
|
263
|
+
delete profilers[serviceId][profileType]
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
serviceIds ??= Object.keys(servicesWorkers)
|
|
268
|
+
|
|
269
|
+
for (const serviceId of serviceIds) {
|
|
270
|
+
const { isPaused, remainingTimeSec } = isProfilingPaused(serviceId)
|
|
271
|
+
if (isPaused) {
|
|
272
|
+
app.log.info(
|
|
273
|
+
{ serviceId },
|
|
274
|
+
`Skipping service profiling, it is paused for ${remainingTimeSec}s`
|
|
275
|
+
)
|
|
276
|
+
process._rawDebug('--------SKIP PROFILING-------', serviceId)
|
|
277
|
+
continue
|
|
177
278
|
}
|
|
178
279
|
|
|
179
|
-
|
|
180
|
-
|
|
280
|
+
profilers[serviceId] ??= {}
|
|
281
|
+
|
|
282
|
+
let profiler = profilers[serviceId][profileType]
|
|
283
|
+
if (!profiler) {
|
|
284
|
+
const workerId = servicesWorkers[serviceId][0]
|
|
285
|
+
const config = profilersConfigs[serviceId]
|
|
286
|
+
profiler = new Profiler({
|
|
287
|
+
app,
|
|
288
|
+
workerId,
|
|
181
289
|
type: profileType,
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
290
|
+
duration: config.durationMillis,
|
|
291
|
+
sourceMaps: config.sourceMaps,
|
|
292
|
+
onProfile: createProfileHandler(scalerUrl, workerId, profileType)
|
|
293
|
+
})
|
|
294
|
+
profilers[serviceId][profileType] = profiler
|
|
295
|
+
}
|
|
188
296
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
297
|
+
process._rawDebug('--------REQUEST PROFILING-------', serviceId)
|
|
298
|
+
profiler.requestProfile({ alertId })
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
// Method to be called when the worker ELU is very high
// to stop profiling and wait for app to go back to normal.
// `options.serviceId` selects the service; `options.timeout` is the pause
// length in milliseconds, counted from now. The pause deadline is recorded
// even when the service has no profilers yet.
app.pauseProfiling = async (options = {}) => {
  if (isFlamegraphsDisabled) {
    app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
    return
  }

  const { serviceId, timeout } = options

  // The timeout may arrive as a numeric string (e.g. read from the environment);
  // adding it to Date.now() unconverted would concatenate instead of sum, and a
  // missing value would store NaN. Anything unusable means "no pause window".
  const pauseMillis = Number(timeout)
  const pauseUntil = Date.now() + (Number.isFinite(pauseMillis) ? pauseMillis : 0)
  profilersPauseReqs[serviceId] = { timestamp: pauseUntil }

  const serviceProfilers = profilers[serviceId]
  if (!serviceProfilers) {
    app.log.debug({ serviceId }, 'Skipping service profiling pause, no profilers found')
    return
  }

  // Profilers are stopped one after another, in insertion order
  for (const [profilerType, profiler] of Object.entries(serviceProfilers)) {
    app.log.info({ serviceId, profilerType }, 'Pausing service profiling due to high ELU')
    await profiler.stop()
  }
}
|
|
327
|
+
|
|
328
|
+
// Reports whether profiling of `serviceId` is currently paused.
// Returns `{ isPaused, remainingTimeSec }`, where `remainingTimeSec` is the
// rounded number of seconds left in the pause, and 0 when the service was
// never paused or its pause has already expired.
function isProfilingPaused (serviceId) {
  const pauseReq = profilersPauseReqs[serviceId]
  if (!pauseReq) {
    return { isPaused: false, remainingTimeSec: 0 }
  }

  const remainingMillis = pauseReq.timestamp - Date.now()
  const isPaused = remainingMillis > 0

  // Clamp so an expired pause never reports a negative remaining time
  const remainingTimeSec = isPaused ? Math.round(remainingMillis / 1000) : 0

  return { isPaused, remainingTimeSec }
}
|
|
342
|
+
|
|
343
|
+
// Builds the `(err, profile, requests)` callback for one worker's profiler.
// On success the profile data is uploaded once, tagged with the first alert id
// found in the requests; the remaining alert ids (if any) are then attached to
// the stored flamegraph. Failures are logged and never thrown to the caller.
function createProfileHandler (scalerUrl, workerId, profileType) {
  // The service id is the part of the worker id before the first ':'
  const serviceId = workerId.split(':')[0]

  return async (err, profile, requests = []) => {
    if (err) {
      app.log.error({ err }, 'Failed to generate a profile')
      return
    }

    // Without this guard a missing profile would surface below as a
    // misleading "Failed to send flamegraph" TypeError.
    if (profile == null) {
      app.log.warn({ workerId, profileType }, 'No profile received, skipping flamegraph upload')
      return
    }

    // Requests without an alert id still trigger the upload, just untagged
    const alertIds = requests
      .map(request => request.alertId)
      .filter(Boolean)

    try {
      const alertId = alertIds.shift()
      const flamegraph = await sendServiceFlamegraph(
        scalerUrl,
        serviceId,
        profile.data,
        profileType,
        alertId
      )

      if (alertIds.length > 0) {
        await _attachFlamegraphToAlerts(
          scalerUrl,
          serviceId,
          flamegraph.id,
          profile.data,
          profileType,
          alertIds
        )
      }
    } catch (err) {
      app.log.error({ err, workerId }, 'Failed to send flamegraph')
    }
  }
}
|
|
@@ -269,7 +386,7 @@ async function flamegraphs (app, _opts) {
|
|
|
269
386
|
async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
|
|
270
387
|
const podId = app.instanceId
|
|
271
388
|
const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
|
|
272
|
-
app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
|
|
389
|
+
app.log.info({ serviceId, podId, profileType, alertId }, 'Sending flamegraph')
|
|
273
390
|
|
|
274
391
|
const query = { profileType }
|
|
275
392
|
if (alertId) {
|
|
@@ -293,8 +410,14 @@ async function flamegraphs (app, _opts) {
|
|
|
293
410
|
throw new Error(`Failed to send flamegraph: ${error}`)
|
|
294
411
|
}
|
|
295
412
|
|
|
296
|
-
const
|
|
297
|
-
|
|
413
|
+
const flamegraph = await body.json()
|
|
414
|
+
|
|
415
|
+
app.log.info(
|
|
416
|
+
{ serviceId, podId, profileType, flamegraph },
|
|
417
|
+
'Flamegraph successfully stored'
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
return flamegraph
|
|
298
421
|
}
|
|
299
422
|
|
|
300
423
|
// Function that supports ICC that doesn't have attach flamegraph API
|
|
@@ -367,15 +490,17 @@ async function flamegraphs (app, _opts) {
|
|
|
367
490
|
}
|
|
368
491
|
}
|
|
369
492
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
for (const
|
|
374
|
-
const
|
|
375
|
-
|
|
376
|
-
|
|
493
|
+
// Stops every tracked profiler (all services, all profile types) and
// resolves once all of them have finished stopping.
app.cleanupFlamegraphs = async () => {
  // Issue every stop first so they run concurrently, then wait for all of them
  const pendingStops = Object.values(profilers)
    .flatMap(serviceProfilers => Object.values(serviceProfilers))
    .map(profiler => profiler.stop())

  await Promise.all(pendingStops)
}
|
|
380
505
|
}
|
|
381
506
|
|
|
@@ -51,6 +51,8 @@ async function healthSignals (app, _opts) {
|
|
|
51
51
|
return
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
+
const pauseEluThreshold = app.env.PLT_FLAMEGRAPHS_PAUSE_ELU_TRESHOLD
|
|
55
|
+
const pauseTimeout = app.env.PLT_FLAMEGRAPHS_PAUSE_TIMEOUT
|
|
54
56
|
const eluThreshold = app.env.PLT_ELU_HEALTH_SIGNAL_THRESHOLD
|
|
55
57
|
|
|
56
58
|
let heapThreshold = app.env.PLT_HEAP_HEALTH_SIGNAL_THRESHOLD
|
|
@@ -88,6 +90,10 @@ async function healthSignals (app, _opts) {
|
|
|
88
90
|
|
|
89
91
|
const { elu, heapUsed, heapTotal } = currentHealth
|
|
90
92
|
|
|
93
|
+
if (elu >= pauseEluThreshold) {
|
|
94
|
+
app.pauseProfiling({ serviceId, timeout: pauseTimeout })
|
|
95
|
+
}
|
|
96
|
+
|
|
91
97
|
if (elu > eluThreshold) {
|
|
92
98
|
healthSignals.push({
|
|
93
99
|
type: 'elu',
|
|
@@ -183,15 +189,10 @@ async function healthSignals (app, _opts) {
|
|
|
183
189
|
app.log.error({ error }, 'Failed to send health signals to scaler')
|
|
184
190
|
}
|
|
185
191
|
|
|
186
|
-
const
|
|
192
|
+
const response = await body.json()
|
|
187
193
|
|
|
188
|
-
app.
|
|
189
|
-
|
|
190
|
-
workerIds: [workerId],
|
|
191
|
-
alertId: alert.id
|
|
192
|
-
}).catch(err => {
|
|
193
|
-
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
194
|
-
})
|
|
194
|
+
app.requestFlamegraphs({ serviceIds: [serviceId], alertId: response.alertId })
|
|
195
|
+
.catch(err => app.log.error({ err }, 'Failed to send a flamegraph'))
|
|
195
196
|
}
|
|
196
197
|
}
|
|
197
198
|
|