@platformatic/watt-extra 1.7.0 → 1.7.1-alpha.2

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,146 +1,234 @@
1
1
  'use strict'
2
2
 
3
- import { setTimeout as sleep } from 'node:timers/promises'
4
3
  import { request } from 'undici'
5
4
 
6
- async function flamegraphs (app, _opts) {
7
- const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
8
- const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
9
- const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
10
- const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
11
- const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
12
- const flamegraphsCacheCleanupInterval = app.env.PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL
5
+ export class Profiler {
6
+ #workerId
7
+ #type
8
+ #duration
9
+ #profileOptions
10
+ #runtime
11
+ #log
12
+ #requests
13
+ #isProfiling
14
+ #onProfile
15
+ #getProfileInterval
16
+ #stopProfileTimeout
17
+
18
+ constructor (options = {}) {
19
+ const { type, duration, workerId, sourceMaps, app, onProfile } = options
20
+
21
+ if (type !== 'cpu' && type !== 'heap') {
22
+ throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
23
+ }
24
+ if (typeof duration !== 'number') {
25
+ throw new Error('Invalid Profiler duration. Must be a number')
26
+ }
27
+ if (typeof workerId !== 'string') {
28
+ throw new Error('Invalid Worker ID. Must be a string')
29
+ }
30
+ if (!workerId.includes(':')) {
31
+ throw new Error('Worker ID must include the service ID and worker index')
32
+ }
33
+ if (typeof onProfile !== 'function') {
34
+ throw new Error('Invalid onProfile handler. Must be a function')
35
+ }
13
36
 
14
- const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
15
- const eluThreshold = parseFloat(flamegraphsELUThreshold)
16
- const gracePeriod = parseInt(flamegraphsGracePeriod)
17
- const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
18
- const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
19
- const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
37
+ this.#type = type
38
+ this.#duration = duration
39
+ this.#workerId = workerId
40
+ this.#onProfile = onProfile
20
41
 
21
- let workerStartedListener = null
42
+ this.#profileOptions = {
43
+ type,
44
+ durationMillis: duration,
45
+ sourceMaps: sourceMaps ?? false
46
+ }
22
47
 
23
- const startProfilingOnWorker = async (runtime, workerFullId, logContext = {}) => {
24
- await sleep(gracePeriod)
48
+ this.#requests = []
49
+ this.#isProfiling = false
25
50
 
26
- // Get application details to read service-level sourceMaps setting
27
- const appDetails = await runtime.getApplicationDetails(workerFullId)
28
- const sourceMaps = appDetails.sourceMaps ?? false
51
+ this.#runtime = app.watt.runtime
52
+ this.#log = app.log.child({
53
+ workerId: this.#workerId,
54
+ profilerType: this.#type
55
+ })
56
+ }
29
57
 
30
- try {
31
- // Start CPU profiling
32
- await runtime.sendCommandToApplication(
33
- workerFullId,
34
- 'startProfiling',
35
- { durationMillis, eluThreshold, type: 'cpu', sourceMaps }
36
- )
58
+ get workerId () {
59
+ return this.#workerId
60
+ }
37
61
 
38
- // Start HEAP profiling
39
- await runtime.sendCommandToApplication(
40
- workerFullId,
41
- 'startProfiling',
42
- { durationMillis, eluThreshold, type: 'heap', sourceMaps }
43
- )
44
- } catch (err) {
45
- app.log.error({ err, ...logContext }, 'Failed to start profiling')
46
- throw err
62
+ get isProfiling () {
63
+ return this.#isProfiling
64
+ }
65
+
66
+ async requestProfile (request = {}) {
67
+ request.timestamp ??= Date.now()
68
+ this.#requests.push(request)
69
+ this.#unscheduleStopProfiling()
70
+
71
+ if (!this.#isProfiling) {
72
+ this.#startProfilingLoop()
47
73
  }
48
74
  }
49
75
 
50
- app.setupFlamegraphs = async () => {
51
- if (isFlamegraphsDisabled) {
52
- app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
76
+ async stop () {
77
+ process._rawDebug('--------PROFILER.STOP-------')
78
+ if (this.#getProfileInterval) {
79
+ clearInterval(this.#getProfileInterval)
80
+ this.#getProfileInterval = null
81
+ }
82
+ if (this.#stopProfileTimeout) {
83
+ clearTimeout(this.#stopProfileTimeout)
84
+ this.#stopProfileTimeout = null
85
+ }
86
+ if (this.#isProfiling) {
87
+ const requests = this.#getProfileRequests()
88
+ try {
89
+ const profile = await this.#stopProfiling()
90
+ if (requests.length > 0) {
91
+ this.#onProfile(null, profile, requests)
92
+ }
93
+ } catch (err) {
94
+ this.#log.error({ err }, 'Failed to stop profiling')
95
+ if (requests.length > 0) {
96
+ this.#onProfile(err, null, requests)
97
+ }
98
+ }
99
+ }
100
+ }
101
+
102
+ async #startProfilingLoop () {
103
+ try {
104
+ await this.#startProfiling()
105
+ } catch (err) {
106
+ this.#log.error({ err }, 'Failed to start profiling')
107
+ const requests = this.#getProfileRequests()
108
+ this.#onProfile(err, null, requests)
53
109
  return
54
110
  }
55
111
 
56
- app.log.info('Start profiling services')
112
+ this.#getProfileInterval = setInterval(
113
+ () => this.#processProfile(),
114
+ this.#duration
115
+ ).unref()
116
+ }
57
117
 
58
- const runtime = app.watt.runtime
59
- const workers = await runtime.getWorkers()
118
+ async #processProfile () {
119
+ try {
120
+ const profile = await this.#getProfile()
121
+ const requests = this.#getProfileRequests(profile.timestamp)
122
+ this.#onProfile(null, profile, requests)
123
+ } catch (err) {
124
+ this.#log.error({ err }, 'Failed to generate a profile')
125
+ const requests = this.#getProfileRequests()
126
+ this.#onProfile(err, null, requests)
127
+ }
60
128
 
61
- const promises = []
62
- for (const [workerFullId, workerInfo] of Object.entries(workers)) {
63
- if (workerInfo.status === 'started') {
64
- const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
65
- promises.push(promise)
66
- }
129
+ if (this.#requests.length === 0) {
130
+ this.#scheduleStopProfiling()
67
131
  }
132
+ }
68
133
 
69
- const results = await Promise.allSettled(promises)
70
- for (const result of results) {
71
- if (result.status === 'rejected') {
72
- app.log.error({ result }, 'Failed to start profiling')
73
- }
134
+ #scheduleStopProfiling () {
135
+ // Stop profiling after the duration/2 if there are no more requests
136
+ this.#stopProfileTimeout = setTimeout(
137
+ () => this.stop(),
138
+ this.#duration / 2
139
+ ).unref()
140
+ }
141
+
142
+ #unscheduleStopProfiling () {
143
+ if (this.#stopProfileTimeout) {
144
+ clearTimeout(this.#stopProfileTimeout)
145
+ this.#stopProfileTimeout = null
74
146
  }
147
+ }
75
148
 
76
- // Listen for new workers starting and start profiling on them
77
- workerStartedListener = ({ application, worker }) => {
78
- if (isFlamegraphsDisabled) {
79
- return
80
- }
149
+ async #startProfiling () {
150
+ this.#isProfiling = true
151
+ this.#log.info('Starting profiling')
81
152
 
82
- const workerFullId = [application, worker].join(':')
83
- app.log.info({ application, worker }, 'Starting profiling on new worker')
153
+ await this.#runtime.sendCommandToApplication(
154
+ this.#workerId, 'startProfiling', this.#profileOptions
155
+ )
156
+ }
157
+
158
+ async #stopProfiling () {
159
+ this.#isProfiling = false
160
+ this.#log.info('Stopping profiling')
84
161
 
85
- startProfilingOnWorker(runtime, workerFullId, { application, worker }).catch(() => {
86
- // Error already logged in startProfilingOnWorker
87
- })
162
+ try {
163
+ const profile = await this.#runtime.sendCommandToApplication(
164
+ this.#workerId, 'stopProfiling', this.#profileOptions
165
+ )
166
+ return profile
167
+ } catch (err) {
168
+ // Ignore errors if the app is already closing
169
+ this.#log.debug({ err }, 'Failed to stop profiling')
88
170
  }
89
- runtime.on('application:worker:started', workerStartedListener)
171
+ }
172
+
173
+ async #getProfile () {
174
+ this.#log.info('Getting profile from worker')
90
175
 
91
- setInterval(cleanupFlamegraphsCache, cacheCleanupInterval).unref()
176
+ const [state, profile] = await Promise.all([
177
+ this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
178
+ this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
179
+ ])
180
+ return { data: profile, timestamp: state.latestProfileTimestamp }
92
181
  }
93
182
 
94
- app.cleanupFlamegraphs = async () => {
95
- if (workerStartedListener && app.watt?.runtime) {
96
- app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
97
- workerStartedListener = null
183
+ #getProfileRequests (profileTimestamp) {
184
+ if (profileTimestamp === undefined) {
185
+ const requests = this.#requests
186
+ this.#requests = []
187
+ return requests
98
188
  }
99
189
 
100
- // Explicitly stop all active profiling sessions to avoid memory corruption
101
- if (!isFlamegraphsDisabled && app.watt?.runtime) {
102
- try {
103
- const workers = await app.watt.runtime.getWorkers()
104
- const stopPromises = []
105
- for (const workerFullId of Object.keys(workers)) {
106
- // Stop both CPU and heap profiling on each worker
107
- stopPromises.push(
108
- app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
109
- .catch(err => {
110
- // Ignore errors if profiling wasn't running
111
- if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
112
- app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
113
- }
114
- })
115
- )
116
- stopPromises.push(
117
- app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
118
- .catch(err => {
119
- // Ignore errors if profiling wasn't running
120
- if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
121
- app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
122
- }
123
- })
124
- )
125
- }
126
- await Promise.all(stopPromises)
127
- // Small delay to ensure native cleanup completes
128
- await sleep(100)
129
- } catch (err) {
130
- app.log.warn({ err }, 'Failed to stop profiling during cleanup')
190
+ let processedIndex = 0
191
+ for (let i = 0; i < this.#requests.length; i++) {
192
+ if (this.#requests[i].timestamp <= profileTimestamp) {
193
+ processedIndex = i + 1
131
194
  }
132
195
  }
196
+ return this.#requests.splice(0, processedIndex)
133
197
  }
198
+ }
134
199
 
135
- const profilesByWorkerId = {}
200
+ async function flamegraphs (app, _opts) {
201
+ const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
202
+ const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
203
+
204
+ const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
136
205
 
137
- app.sendFlamegraphs = async (options = {}) => {
206
+ const profilers = {}
207
+ const profilersConfigs = {}
208
+ const profilersPauseReqs = {}
209
+
210
+ app.setupFlamegraphs = async () => {
138
211
  if (isFlamegraphsDisabled) {
139
- app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
212
+ app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
140
213
  return
141
214
  }
142
215
 
143
- let { workerIds, alertId, profileType = 'cpu' } = options
216
+ const runtime = app.watt.runtime
217
+ const { applications } = await runtime.getApplications()
218
+
219
+ for (const application of applications) {
220
+ const appDetails = await runtime.getApplicationDetails(application.id)
221
+ const sourceMaps = appDetails.sourceMaps ?? false
222
+ profilersConfigs[application.id] = { durationMillis, sourceMaps }
223
+ }
224
+ }
225
+
226
+ app.requestFlamegraphs = async (options = {}) => {
227
+ process._rawDebug('--------REQUEST PROFILING-------', options)
228
+ if (isFlamegraphsDisabled) {
229
+ app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
230
+ return
231
+ }
144
232
 
145
233
  const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
146
234
  if (!scalerUrl) {
@@ -150,118 +238,147 @@ async function flamegraphs (app, _opts) {
150
238
 
151
239
  const runtime = app.watt.runtime
152
240
 
153
- if (!workerIds) {
154
- const { applications } = await runtime.getApplications()
155
- workerIds = applications.map(app => app.id)
156
- }
241
+ let { serviceIds, alertId, profileType = 'cpu' } = options
157
242
 
158
- cleanupFlamegraphsCache()
243
+ const servicesWorkers = {}
244
+ const workers = await runtime.getWorkers()
159
245
 
160
- const uploadPromises = workerIds.map(async (workerId) => {
161
- const serviceId = workerId.split(':')[0]
162
- const profileKey = `${workerId}:${profileType}`
246
+ for (const workerId in workers) {
247
+ const workerInfo = workers[workerId]
248
+ const serviceId = workerInfo.application
163
249
 
164
- let profile = profilesByWorkerId[profileKey]
165
- if (profile !== undefined) {
166
- if (alertId) {
167
- app.log.info(
168
- { workerId, alertId }, 'Flamegraph will be attached to the alert'
169
- )
170
- profile.waitingAlerts.push(alertId)
171
- }
250
+ servicesWorkers[serviceId] ??= []
251
+ servicesWorkers[serviceId].push(workerId)
252
+ }
172
253
 
173
- if (profile.flamegraphId === null) {
174
- app.log.info({ workerId }, 'Waiting for flamegraph to be generated and sent')
175
- return
254
+ for (const serviceId in profilers) {
255
+ const workerProfilers = profilers[serviceId]
256
+ for (const profileType in workerProfilers) {
257
+ const profiler = workerProfilers[profileType]
258
+ const workerId = profiler.workerId
259
+ if (workers[workerId]) continue
260
+ if (profiler.isProfiling) {
261
+ profiler.stop()
176
262
  }
263
+ delete profilers[serviceId][profileType]
264
+ }
265
+ }
266
+
267
+ serviceIds ??= Object.keys(servicesWorkers)
268
+
269
+ for (const serviceId of serviceIds) {
270
+ const { isPaused, remainingTimeSec } = isProfilingPaused(serviceId)
271
+ if (isPaused) {
272
+ app.log.info(
273
+ { serviceId },
274
+ `Skipping service profiling, it is paused for ${remainingTimeSec}s`
275
+ )
276
+ process._rawDebug('--------SKIP PROFILING-------', serviceId)
277
+ continue
177
278
  }
178
279
 
179
- if (profile === undefined) {
180
- profile = {
280
+ profilers[serviceId] ??= {}
281
+
282
+ let profiler = profilers[serviceId][profileType]
283
+ if (!profiler) {
284
+ const workerId = servicesWorkers[serviceId][0]
285
+ const config = profilersConfigs[serviceId]
286
+ profiler = new Profiler({
287
+ app,
288
+ workerId,
181
289
  type: profileType,
182
- data: null,
183
- timestamp: null,
184
- flamegraphId: null,
185
- waitingAlerts: []
186
- }
187
- profilesByWorkerId[profileKey] = profile
290
+ duration: config.durationMillis,
291
+ sourceMaps: config.sourceMaps,
292
+ onProfile: createProfileHandler(scalerUrl, workerId, profileType)
293
+ })
294
+ profilers[serviceId][profileType] = profiler
295
+ }
188
296
 
189
- const result = await getServiceFlamegraph(workerId, profileType)
190
- if (!result || !(result.data instanceof Uint8Array)) {
191
- app.log.error({ workerId }, 'Failed to get profile from service')
192
- delete profilesByWorkerId[profileKey]
193
- return
194
- }
297
+ process._rawDebug('--------REQUEST PROFILING-------', serviceId)
298
+ profiler.requestProfile({ alertId })
299
+ }
300
+ }
301
+
302
+ // Method to be called when the worker ELU is very high
303
+ // to stop profiling and wait for app to go back to normal
304
+ app.pauseProfiling = async (options = {}) => {
305
+ process._rawDebug('--------PAUSE PROFILING-------', options)
306
+ if (isFlamegraphsDisabled) {
307
+ app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
308
+ return
309
+ }
310
+
311
+ const { serviceId, timeout } = options
312
+
313
+ profilersPauseReqs[serviceId] = { timestamp: timeout + Date.now() }
314
+
315
+ const serviceProfilers = profilers[serviceId]
316
+ if (!serviceProfilers) {
317
+ app.log.debug({ serviceId }, 'Skipping service profiling pause, no profilers found')
318
+ return
319
+ }
320
+
321
+ for (const profilerType in profilers[serviceId]) {
322
+ const profiler = profilers[serviceId][profilerType]
323
+ app.log.info({ serviceId, profilerType }, 'Pausing service profiling due to high ELU')
324
+ await profiler.stop()
325
+ }
326
+ }
327
+
328
+ function isProfilingPaused (serviceId) {
329
+ let isPaused = false
330
+ let remainingTimeSec = 0
331
+
332
+ const pauseReq = profilersPauseReqs[serviceId]
333
+ if (pauseReq) {
334
+ const now = Date.now()
335
+ isPaused = pauseReq.timestamp > now
336
+ remainingTimeSec = Math.round((pauseReq.timestamp - now) / 1000)
337
+ process._rawDebug('--------IS PROFILING PAUSED-------', isPaused, remainingTimeSec)
338
+ }
339
+
340
+ return { isPaused, remainingTimeSec }
341
+ }
342
+
343
+ function createProfileHandler (scalerUrl, workerId, profileType) {
344
+ const serviceId = workerId.split(':')[0]
195
345
 
196
- profile.data = result.data
197
- profile.timestamp = result.timestamp
346
+ return async (err, profile, requests) => {
347
+ process._rawDebug('--------PROFILER HANDLER-------', !!profile, requests)
348
+ if (err) {
349
+ app.log.error({ err }, 'Failed to generate a profile')
350
+ return
198
351
  }
199
352
 
200
- if (profile.flamegraphId === null || !alertId) {
201
- try {
202
- const flamegraph = await sendServiceFlamegraph(
203
- scalerUrl,
204
- serviceId,
205
- profile.data,
206
- profileType,
207
- alertId
208
- )
209
- profile.flamegraphId = flamegraph.id
210
- } catch (err) {
211
- app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
212
- delete profilesByWorkerId[profileKey]
213
- return
353
+ const alertIds = []
354
+ for (const request of requests) {
355
+ if (request.alertId) {
356
+ alertIds.push(request.alertId)
214
357
  }
215
358
  }
216
359
 
217
- const waitingAlerts = profile.waitingAlerts
218
- if (waitingAlerts.length > 0) {
219
- profile.waitingAlerts = []
220
- await _attachFlamegraphToAlerts(
360
+ try {
361
+ const alertId = alertIds.shift()
362
+ const flamegraph = await sendServiceFlamegraph(
221
363
  scalerUrl,
222
364
  serviceId,
223
- profile.flamegraphId,
224
365
  profile.data,
225
- profile.type,
226
- waitingAlerts
227
- )
228
- }
229
- })
230
-
231
- await Promise.all(uploadPromises)
232
- }
233
-
234
- async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
235
- const runtime = app.watt.runtime
236
-
237
- app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
238
-
239
- try {
240
- const [state, profile] = await Promise.all([
241
- runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
242
- runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
243
- ])
244
- return { data: profile, timestamp: state.latestProfileTimestamp }
245
- } catch (err) {
246
- if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
247
- app.log.info(
248
- { workerId, attempt, maxAttempts, attemptTimeout },
249
- 'No profile available for the service. Waiting for profiling to complete.'
366
+ profileType,
367
+ alertId
250
368
  )
251
- if (attempt <= maxAttempts) {
252
- await sleep(attemptTimeout)
253
- return getServiceFlamegraph(workerId, profileType, attempt + 1)
254
- }
255
- } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
256
- app.log.info({ workerId }, 'ELU low, CPU profiling not active')
257
- } else {
258
- app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
259
369
 
260
- const [serviceId, workerIndex] = workerId.split(':')
261
- if (workerIndex) {
262
- app.log.warn('Worker not available, trying to get profile from another worker')
263
- return getServiceFlamegraph(serviceId, profileType)
370
+ if (alertIds.length > 0) {
371
+ await _attachFlamegraphToAlerts(
372
+ scalerUrl,
373
+ serviceId,
374
+ flamegraph.id,
375
+ profile.data,
376
+ profileType,
377
+ alertIds
378
+ )
264
379
  }
380
+ } catch (err) {
381
+ app.log.error({ err, workerId }, 'Failed to send flamegraph')
265
382
  }
266
383
  }
267
384
  }
@@ -269,7 +386,7 @@ async function flamegraphs (app, _opts) {
269
386
  async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
270
387
  const podId = app.instanceId
271
388
  const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
272
- app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
389
+ app.log.info({ serviceId, podId, profileType, alertId }, 'Sending flamegraph')
273
390
 
274
391
  const query = { profileType }
275
392
  if (alertId) {
@@ -293,8 +410,14 @@ async function flamegraphs (app, _opts) {
293
410
  throw new Error(`Failed to send flamegraph: ${error}`)
294
411
  }
295
412
 
296
- const response = await body.json()
297
- return response
413
+ const flamegraph = await body.json()
414
+
415
+ app.log.info(
416
+ { serviceId, podId, profileType, flamegraph },
417
+ 'Flamegraph successfully stored'
418
+ )
419
+
420
+ return flamegraph
298
421
  }
299
422
 
300
423
  // Function that supports ICC that doesn't have attach flamegraph API
@@ -367,15 +490,17 @@ async function flamegraphs (app, _opts) {
367
490
  }
368
491
  }
369
492
 
370
- function cleanupFlamegraphsCache () {
371
- const now = Date.now()
372
-
373
- for (const profileKey of Object.keys(profilesByWorkerId)) {
374
- const timestamp = profilesByWorkerId[profileKey]?.timestamp
375
- if (timestamp && now - timestamp > durationMillis) {
376
- delete profilesByWorkerId[profileKey]
493
+ app.cleanupFlamegraphs = async () => {
494
+ // Stop all tracked profilers in parallel
495
+ const stopPromises = []
496
+ for (const serviceId in profilers) {
497
+ const serviceProfilers = profilers[serviceId]
498
+ for (const profileType in serviceProfilers) {
499
+ const profiler = serviceProfilers[profileType]
500
+ stopPromises.push(profiler.stop())
377
501
  }
378
502
  }
503
+ await Promise.all(stopPromises)
379
504
  }
380
505
  }
381
506
 
@@ -51,6 +51,8 @@ async function healthSignals (app, _opts) {
51
51
  return
52
52
  }
53
53
 
54
+ const pauseEluThreshold = app.env.PLT_FLAMEGRAPHS_PAUSE_ELU_TRESHOLD
55
+ const pauseTimeout = app.env.PLT_FLAMEGRAPHS_PAUSE_TIMEOUT
54
56
  const eluThreshold = app.env.PLT_ELU_HEALTH_SIGNAL_THRESHOLD
55
57
 
56
58
  let heapThreshold = app.env.PLT_HEAP_HEALTH_SIGNAL_THRESHOLD
@@ -88,6 +90,10 @@ async function healthSignals (app, _opts) {
88
90
 
89
91
  const { elu, heapUsed, heapTotal } = currentHealth
90
92
 
93
+ if (elu >= pauseEluThreshold) {
94
+ app.pauseProfiling({ serviceId, timeout: pauseTimeout })
95
+ }
96
+
91
97
  if (elu > eluThreshold) {
92
98
  healthSignals.push({
93
99
  type: 'elu',
@@ -183,15 +189,10 @@ async function healthSignals (app, _opts) {
183
189
  app.log.error({ error }, 'Failed to send health signals to scaler')
184
190
  }
185
191
 
186
- const alert = await body.json()
192
+ const response = await body.json()
187
193
 
188
- app.sendFlamegraphs({
189
- serviceIds: [serviceId],
190
- workerIds: [workerId],
191
- alertId: alert.id
192
- }).catch(err => {
193
- app.log.error({ err }, 'Failed to send a flamegraph')
194
- })
194
+ app.requestFlamegraphs({ serviceIds: [serviceId], alertId: response.alertId })
195
+ .catch(err => app.log.error({ err }, 'Failed to send a flamegraph'))
195
196
  }
196
197
  }
197
198