@platformatic/watt-extra 1.6.3-alpha.5 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,14 @@
1
1
  {
2
2
  "permissions": {
3
3
  "allow": [
4
- "Bash(node --test-only:*)",
5
- "Bash(node --test:*)",
6
- "Bash(for i in {1..3})",
7
- "Bash(do echo \"=== Run $i ===\")",
8
- "Bash(done)"
4
+ "Read(//work/workspaces/workspace-platformatic/platformatic/**)",
5
+ "Bash(npx borp:*)",
6
+ "Bash(timeout 30 npx borp -c 1 --timeout=20000 ./test/trigger-flamegraphs.test.js)",
7
+ "Bash(xargs cat:*)",
8
+ "Bash(pnpm install)",
9
+ "Bash(find:*)",
10
+ "Bash(cat:*)",
11
+ "WebFetch(domain:github.com)"
9
12
  ],
10
13
  "deny": [],
11
14
  "ask": []
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@platformatic/watt-extra",
3
- "version": "1.6.3-alpha.5",
3
+ "version": "1.7.0",
4
4
  "description": "The Platformatic runtime manager",
5
5
  "type": "module",
6
6
  "scripts": {
package/plugins/alerts.js CHANGED
@@ -8,6 +8,7 @@ async function alerts (app, _opts) {
8
8
  app.instanceConfig?.scaler?.alertRetentionWindow || 10 * 1000
9
9
 
10
10
  const lastServicesAlertTime = {}
11
+ const workerStartTimes = new Map() // Track per-worker start times for grace period
11
12
 
12
13
  async function setupAlerts () {
13
14
  const scalerAlgorithmVersion = app.instanceConfig?.scaler?.version ?? 'v1'
@@ -17,6 +18,9 @@ async function alerts (app, _opts) {
17
18
  }
18
19
  app.log.info('Setting up v1 scaler alerts')
19
20
 
21
+ // Grace period during which alerts are suppressed per-worker.
22
+ const gracePeriodMs = app.env.PLT_ALERTS_GRACE_PERIOD_SEC * 1000
23
+
20
24
  // Skip alerts setup if ICC is not configured
21
25
  if (!app.env.PLT_ICC_URL) {
22
26
  app.log.info('PLT_ICC_URL not set, skipping alerts setup')
@@ -33,6 +37,18 @@ async function alerts (app, _opts) {
33
37
  return
34
38
  }
35
39
 
40
+ // Default start time for workers that started before the listener was registered
41
+ const pluginStartTime = Date.now()
42
+
43
+ // Listen for worker start events to track start times
44
+ runtime.on('application:worker:started', (workerInfo) => {
45
+ const workerId = workerInfo?.id
46
+ if (workerId) {
47
+ workerStartTimes.set(workerId, Date.now())
48
+ app.log.debug({ workerId }, 'Worker started, tracking for grace period')
49
+ }
50
+ })
51
+
36
52
  const processHealthInfo = async (healthInfo) => {
37
53
  if (!healthInfo) {
38
54
  app.log.error('No health info received')
@@ -55,6 +71,14 @@ async function alerts (app, _opts) {
55
71
  healthCache.splice(0, validIndex)
56
72
  }
57
73
 
74
+ // Skip sending alerts during worker's grace period.
75
+ // Use plugin start time as default for workers that started before the listener.
76
+ const workerStartTime = workerStartTimes.get(workerId) ?? pluginStartTime
77
+ if (timestamp - workerStartTime < gracePeriodMs) {
78
+ app.log.debug({ workerId }, 'Skipping alert during worker grace period')
79
+ return
80
+ }
81
+
58
82
  // healthInfo is an object with the following structure:
59
83
  // id: "service-1"
60
84
  // service: "service-1"
@@ -112,7 +136,7 @@ async function alerts (app, _opts) {
112
136
 
113
137
  const alert = await body.json()
114
138
 
115
- app.requestFlamegraphs({
139
+ app.sendFlamegraphs({
116
140
  workerIds: [workerId],
117
141
  alertId: alert.id
118
142
  }).catch(err => {
package/plugins/env.js CHANGED
@@ -26,7 +26,8 @@ const schema = {
26
26
  PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
27
27
  PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
28
28
  PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },
29
- PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' }
29
+ PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' },
30
+ PLT_ALERTS_GRACE_PERIOD_SEC: { type: 'number', default: 30 }
30
31
  }
31
32
  }
32
33
 
@@ -1,207 +1,147 @@
1
1
  'use strict'
2
2
 
3
+ import { setTimeout as sleep } from 'node:timers/promises'
3
4
  import { request } from 'undici'
4
5
 
5
- export class Profiler {
6
- #workerId
7
- #type
8
- #duration
9
- #profileOptions
10
- #runtime
11
- #log
12
- #requests
13
- #isProfiling
14
- #onProfile
15
- #getProfileInterval
16
- #stopProfileTimeout
17
-
18
- constructor (options = {}) {
19
- const { type, duration, workerId, sourceMaps, app, onProfile } = options
20
-
21
- if (type !== 'cpu' && type !== 'heap') {
22
- throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
23
- }
24
- if (typeof duration !== 'number') {
25
- throw new Error('Invalid Profiler duration. Must be a number')
26
- }
27
- if (typeof workerId !== 'string') {
28
- throw new Error('Invalid Worker ID. Must be a string')
29
- }
30
- if (!workerId.includes(':')) {
31
- throw new Error('Worker ID must include the service ID and worker index')
32
- }
33
- if (typeof onProfile !== 'function') {
34
- throw new Error('Invalid onProfile handler. Must be a function')
35
- }
36
-
37
- this.#type = type
38
- this.#duration = duration
39
- this.#workerId = workerId
40
- this.#onProfile = onProfile
6
+ async function flamegraphs (app, _opts) {
7
+ const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
8
+ const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
9
+ const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
10
+ const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
11
+ const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
12
+ const flamegraphsCacheCleanupInterval = app.env.PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL
41
13
 
42
- this.#profileOptions = {
43
- type,
44
- durationMillis: duration,
45
- sourceMaps: sourceMaps ?? false
46
- }
14
+ const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
15
+ const eluThreshold = parseFloat(flamegraphsELUThreshold)
16
+ const gracePeriod = parseInt(flamegraphsGracePeriod)
17
+ const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
18
+ const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
19
+ const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
47
20
 
48
- this.#requests = []
49
- this.#isProfiling = false
21
+ let workerStartedListener = null
50
22
 
51
- this.#runtime = app.watt.runtime
52
- this.#log = app.log.child({
53
- workerId: this.#workerId,
54
- profilerType: this.#type
55
- })
56
- }
23
+ const startProfilingOnWorker = async (runtime, workerFullId, logContext = {}) => {
24
+ await sleep(gracePeriod)
57
25
 
58
- async requestProfile (request = {}) {
59
- process._rawDebug('--------REQUEST--------', request)
60
- request.timestamp ??= Date.now()
61
- this.#requests.push(request)
62
- this.#unscheduleStopProfiling()
26
+ // Get application details to read service-level sourceMaps setting
27
+ const appDetails = await runtime.getApplicationDetails(workerFullId)
28
+ const sourceMaps = appDetails.sourceMaps ?? false
63
29
 
64
- if (!this.#isProfiling) {
65
- this.#startProfilingLoop()
66
- }
67
- }
30
+ try {
31
+ // Start CPU profiling
32
+ await runtime.sendCommandToApplication(
33
+ workerFullId,
34
+ 'startProfiling',
35
+ { durationMillis, eluThreshold, type: 'cpu', sourceMaps }
36
+ )
68
37
 
69
- async stop () {
70
- if (this.#getProfileInterval) {
71
- clearInterval(this.#getProfileInterval)
72
- this.#getProfileInterval = null
73
- }
74
- if (this.#stopProfileTimeout) {
75
- clearTimeout(this.#stopProfileTimeout)
76
- this.#stopProfileTimeout = null
77
- }
78
- if (this.#isProfiling) {
79
- await this.#stopProfiling()
38
+ // Start HEAP profiling
39
+ await runtime.sendCommandToApplication(
40
+ workerFullId,
41
+ 'startProfiling',
42
+ { durationMillis, eluThreshold, type: 'heap', sourceMaps }
43
+ )
44
+ } catch (err) {
45
+ app.log.error({ err, ...logContext }, 'Failed to start profiling')
46
+ throw err
80
47
  }
81
48
  }
82
49
 
83
- async #startProfilingLoop () {
84
- try {
85
- await this.#startProfiling()
86
- } catch (err) {
87
- this.#log.error({ err }, 'Failed to start profiling')
88
- const requests = this.#getProfileRequests(Date.now())
89
- this.#onProfile(err, null, requests)
50
+ app.setupFlamegraphs = async () => {
51
+ if (isFlamegraphsDisabled) {
52
+ app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
90
53
  return
91
54
  }
92
55
 
93
- this.#getProfileInterval = setInterval(
94
- () => this.#processProfile(),
95
- this.#duration
96
- ).unref()
97
- }
56
+ app.log.info('Start profiling services')
98
57
 
99
- async #processProfile () {
100
- try {
101
- const profile = await this.#getProfile()
102
- const requests = this.#getProfileRequests(profile.timestamp)
103
- this.#onProfile(null, profile, requests)
104
- } catch (err) {
105
- this.#log.error({ err }, 'Failed to generate a profile')
106
- const requests = this.#getProfileRequests(Date.now())
107
- this.#onProfile(err, null, requests)
108
- }
58
+ const runtime = app.watt.runtime
59
+ const workers = await runtime.getWorkers()
109
60
 
110
- if (this.#requests.length === 0) {
111
- this.#scheduleStopProfiling()
61
+ const promises = []
62
+ for (const [workerFullId, workerInfo] of Object.entries(workers)) {
63
+ if (workerInfo.status === 'started') {
64
+ const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
65
+ promises.push(promise)
66
+ }
112
67
  }
113
- }
114
-
115
- #scheduleStopProfiling () {
116
- // Stop profiling after the duration/2 if there are no more requests
117
- this.#stopProfileTimeout = setTimeout(
118
- () => this.stop(),
119
- this.#duration / 2
120
- ).unref()
121
- }
122
68
 
123
- #unscheduleStopProfiling () {
124
- if (this.#stopProfileTimeout) {
125
- clearTimeout(this.#stopProfileTimeout)
126
- this.#stopProfileTimeout = null
69
+ const results = await Promise.allSettled(promises)
70
+ for (const result of results) {
71
+ if (result.status === 'rejected') {
72
+ app.log.error({ result }, 'Failed to start profiling')
73
+ }
127
74
  }
128
- }
129
-
130
- async #startProfiling () {
131
- this.#isProfiling = true
132
- this.#log.info('Starting profiling')
133
75
 
134
- await this.#runtime.sendCommandToApplication(
135
- this.#workerId, 'startProfiling', this.#profileOptions
136
- )
137
- }
76
+ // Listen for new workers starting and start profiling on them
77
+ workerStartedListener = ({ application, worker }) => {
78
+ if (isFlamegraphsDisabled) {
79
+ return
80
+ }
138
81
 
139
- async #stopProfiling () {
140
- this.#isProfiling = false
141
- this.#log.info('Stopping profiling')
82
+ const workerFullId = [application, worker].join(':')
83
+ app.log.info({ application, worker }, 'Starting profiling on new worker')
142
84
 
143
- try {
144
- await this.#runtime.sendCommandToApplication(
145
- this.#workerId, 'stopProfiling', this.#profileOptions
146
- )
147
- } catch (err) {
148
- // Ignore errors if the app is already closing
149
- this.#log.debug({ err }, 'Failed to stop profiling')
85
+ startProfilingOnWorker(runtime, workerFullId, { application, worker }).catch(() => {
86
+ // Error already logged in startProfilingOnWorker
87
+ })
150
88
  }
151
- }
152
-
153
- async #getProfile () {
154
- this.#log.info('Getting profile from worker')
155
-
156
- const [state, profile] = await Promise.all([
157
- this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
158
- this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
159
- ])
160
- return { data: profile, timestamp: state.latestProfileTimestamp }
161
- }
89
+ runtime.on('application:worker:started', workerStartedListener)
162
90
 
163
- #getProfileRequests (profileTimestamp) {
164
- let processedIndex = 0
165
- for (let i = 0; i < this.#requests.length; i++) {
166
- if (this.#requests[i].timestamp <= profileTimestamp) {
167
- processedIndex = i + 1
168
- }
169
- }
170
- return this.#requests.splice(0, processedIndex)
91
+ setInterval(cleanupFlamegraphsCache, cacheCleanupInterval).unref()
171
92
  }
172
- }
173
-
174
- async function flamegraphs (app, _opts) {
175
- const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
176
- const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
177
-
178
- const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
179
-
180
- const profilers = {}
181
- const profilersConfigs = {}
182
93
 
183
- app.setupFlamegraphs = async () => {
184
- if (isFlamegraphsDisabled) {
185
- app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
186
- return
94
+ app.cleanupFlamegraphs = async () => {
95
+ if (workerStartedListener && app.watt?.runtime) {
96
+ app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
97
+ workerStartedListener = null
187
98
  }
188
99
 
189
- const runtime = app.watt.runtime
190
- const { applications } = await runtime.getApplications()
191
-
192
- for (const application of applications) {
193
- const appDetails = await runtime.getApplicationDetails(application.id)
194
- const sourceMaps = appDetails.sourceMaps ?? false
195
- profilersConfigs[application.id] = { durationMillis, sourceMaps }
100
+ // Explicitly stop all active profiling sessions to avoid memory corruption
101
+ if (!isFlamegraphsDisabled && app.watt?.runtime) {
102
+ try {
103
+ const workers = await app.watt.runtime.getWorkers()
104
+ const stopPromises = []
105
+ for (const workerFullId of Object.keys(workers)) {
106
+ // Stop both CPU and heap profiling on each worker
107
+ stopPromises.push(
108
+ app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
109
+ .catch(err => {
110
+ // Ignore errors if profiling wasn't running
111
+ if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
112
+ app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
113
+ }
114
+ })
115
+ )
116
+ stopPromises.push(
117
+ app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
118
+ .catch(err => {
119
+ // Ignore errors if profiling wasn't running
120
+ if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
121
+ app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
122
+ }
123
+ })
124
+ )
125
+ }
126
+ await Promise.all(stopPromises)
127
+ // Small delay to ensure native cleanup completes
128
+ await sleep(100)
129
+ } catch (err) {
130
+ app.log.warn({ err }, 'Failed to stop profiling during cleanup')
131
+ }
196
132
  }
197
133
  }
198
134
 
199
- app.requestFlamegraphs = async (options = {}) => {
135
+ const profilesByWorkerId = {}
136
+
137
+ app.sendFlamegraphs = async (options = {}) => {
200
138
  if (isFlamegraphsDisabled) {
201
139
  app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
202
140
  return
203
141
  }
204
142
 
143
+ let { workerIds, alertId, profileType = 'cpu' } = options
144
+
205
145
  const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
206
146
  if (!scalerUrl) {
207
147
  app.log.error('No scaler URL found in ICC services, cannot send flamegraph')
@@ -210,91 +150,118 @@ async function flamegraphs (app, _opts) {
210
150
 
211
151
  const runtime = app.watt.runtime
212
152
 
213
- let { workerIds, alertId, profileType = 'cpu' } = options
214
-
215
- const servicesWorkers = {}
216
- const workers = await runtime.getWorkers()
217
- for (const workerId in workers) {
218
- const workerInfo = workers[workerId]
219
- const serviceId = workerInfo.application
220
-
221
- servicesWorkers[serviceId] ??= []
222
- servicesWorkers[serviceId].push(workerId)
153
+ if (!workerIds) {
154
+ const { applications } = await runtime.getApplications()
155
+ workerIds = applications.map(app => app.id)
223
156
  }
224
157
 
225
- workerIds ??= Object.keys(servicesWorkers)
226
-
227
- for (let workerId of workerIds) {
228
- const [serviceId, workerIndex] = workerId.split(':')
229
- if (workerIndex === undefined) {
230
- workerId = servicesWorkers[serviceId][0]
231
- }
232
-
233
- if (workerId === undefined) {
234
- app.log.error({ serviceId }, 'No worker found for an application')
235
- continue
236
- }
158
+ cleanupFlamegraphsCache()
237
159
 
160
+ const uploadPromises = workerIds.map(async (workerId) => {
161
+ const serviceId = workerId.split(':')[0]
238
162
  const profileKey = `${workerId}:${profileType}`
239
163
 
240
- let profiler = profilers[profileKey]
241
- if (!profiler) {
242
- const config = profilersConfigs[serviceId]
243
- profiler = new Profiler({
244
- app,
245
- workerId,
246
- type: profileType,
247
- duration: config.durationMillis,
248
- sourceMaps: config.sourceMaps,
249
- onProfile: createProfileHandler(scalerUrl, workerId, profileType)
250
- })
251
- profilers[profileKey] = profiler
164
+ let profile = profilesByWorkerId[profileKey]
165
+ if (profile !== undefined) {
166
+ if (alertId) {
167
+ app.log.info(
168
+ { workerId, alertId }, 'Flamegraph will be attached to the alert'
169
+ )
170
+ profile.waitingAlerts.push(alertId)
171
+ }
172
+
173
+ if (profile.flamegraphId === null) {
174
+ app.log.info({ workerId }, 'Waiting for flamegraph to be generated and sent')
175
+ return
176
+ }
252
177
  }
253
178
 
254
- profiler.requestProfile({ alertId })
255
- }
256
- }
179
+ if (profile === undefined) {
180
+ profile = {
181
+ type: profileType,
182
+ data: null,
183
+ timestamp: null,
184
+ flamegraphId: null,
185
+ waitingAlerts: []
186
+ }
187
+ profilesByWorkerId[profileKey] = profile
257
188
 
258
- function createProfileHandler (scalerUrl, workerId, profileType) {
259
- const serviceId = workerId.split(':')[0]
189
+ const result = await getServiceFlamegraph(workerId, profileType)
190
+ if (!result || !(result.data instanceof Uint8Array)) {
191
+ app.log.error({ workerId }, 'Failed to get profile from service')
192
+ delete profilesByWorkerId[profileKey]
193
+ return
194
+ }
260
195
 
261
- return async (err, profile, requests) => {
262
- if (err) {
263
- app.log.error({ err }, 'Failed to generate a profile')
264
- return
196
+ profile.data = result.data
197
+ profile.timestamp = result.timestamp
265
198
  }
266
199
 
267
- const alertIds = []
268
- for (const request of requests) {
269
- if (request.alertId) {
270
- alertIds.push(request.alertId)
200
+ if (profile.flamegraphId === null || !alertId) {
201
+ try {
202
+ const flamegraph = await sendServiceFlamegraph(
203
+ scalerUrl,
204
+ serviceId,
205
+ profile.data,
206
+ profileType,
207
+ alertId
208
+ )
209
+ profile.flamegraphId = flamegraph.id
210
+ } catch (err) {
211
+ app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
212
+ delete profilesByWorkerId[profileKey]
213
+ return
271
214
  }
272
215
  }
273
216
 
274
- process._rawDebug('--------ALERT IDS--------', alertIds)
275
-
276
- try {
277
- const alertId = alertIds.shift()
278
- const flamegraph = await sendServiceFlamegraph(
217
+ const waitingAlerts = profile.waitingAlerts
218
+ if (waitingAlerts.length > 0) {
219
+ profile.waitingAlerts = []
220
+ await _attachFlamegraphToAlerts(
279
221
  scalerUrl,
280
222
  serviceId,
223
+ profile.flamegraphId,
281
224
  profile.data,
282
- profileType,
283
- alertId
225
+ profile.type,
226
+ waitingAlerts
284
227
  )
228
+ }
229
+ })
285
230
 
286
- if (alertIds.length > 0) {
287
- await _attachFlamegraphToAlerts(
288
- scalerUrl,
289
- serviceId,
290
- flamegraph.id,
291
- profile.data,
292
- profileType,
293
- alertIds
294
- )
231
+ await Promise.all(uploadPromises)
232
+ }
233
+
234
+ async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
235
+ const runtime = app.watt.runtime
236
+
237
+ app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
238
+
239
+ try {
240
+ const [state, profile] = await Promise.all([
241
+ runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
242
+ runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
243
+ ])
244
+ return { data: profile, timestamp: state.latestProfileTimestamp }
245
+ } catch (err) {
246
+ if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
247
+ app.log.info(
248
+ { workerId, attempt, maxAttempts, attemptTimeout },
249
+ 'No profile available for the service. Waiting for profiling to complete.'
250
+ )
251
+ if (attempt <= maxAttempts) {
252
+ await sleep(attemptTimeout)
253
+ return getServiceFlamegraph(workerId, profileType, attempt + 1)
254
+ }
255
+ } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
256
+ app.log.info({ workerId }, 'ELU low, CPU profiling not active')
257
+ } else {
258
+ app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
259
+
260
+ const [serviceId, workerIndex] = workerId.split(':')
261
+ if (workerIndex) {
262
+ app.log.warn('Worker not available, trying to get profile from another worker')
263
+ return getServiceFlamegraph(serviceId, profileType)
295
264
  }
296
- } catch (err) {
297
- app.log.error({ err, workerId }, 'Failed to send flamegraph')
298
265
  }
299
266
  }
300
267
  }
@@ -302,7 +269,7 @@ async function flamegraphs (app, _opts) {
302
269
  async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
303
270
  const podId = app.instanceId
304
271
  const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
305
- app.log.info({ serviceId, podId, profileType, alertId }, 'Sending flamegraph')
272
+ app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
306
273
 
307
274
  const query = { profileType }
308
275
  if (alertId) {
@@ -326,14 +293,8 @@ async function flamegraphs (app, _opts) {
326
293
  throw new Error(`Failed to send flamegraph: ${error}`)
327
294
  }
328
295
 
329
- const flamegraph = await body.json()
330
-
331
- app.log.info(
332
- { serviceId, podId, profileType, flamegraph },
333
- 'Flamegraph successfully stored'
334
- )
335
-
336
- return flamegraph
296
+ const response = await body.json()
297
+ return response
337
298
  }
338
299
 
339
300
  // Function that supports ICC that doesn't have attach flamegraph API
@@ -406,10 +367,15 @@ async function flamegraphs (app, _opts) {
406
367
  }
407
368
  }
408
369
 
409
- app.cleanupFlamegraphs = async () => {
410
- // Stop all tracked profilers in parallel
411
- const stopPromises = Object.values(profilers).map(profiler => profiler.stop())
412
- await Promise.all(stopPromises)
370
+ function cleanupFlamegraphsCache () {
371
+ const now = Date.now()
372
+
373
+ for (const profileKey of Object.keys(profilesByWorkerId)) {
374
+ const timestamp = profilesByWorkerId[profileKey]?.timestamp
375
+ if (timestamp && now - timestamp > durationMillis) {
376
+ delete profilesByWorkerId[profileKey]
377
+ }
378
+ }
413
379
  }
414
380
  }
415
381
 
@@ -183,14 +183,12 @@ async function healthSignals (app, _opts) {
183
183
  app.log.error({ error }, 'Failed to send health signals to scaler')
184
184
  }
185
185
 
186
- const response = await body.json()
186
+ const alert = await body.json()
187
187
 
188
- process._rawDebug('--------SEND HEALTH SIGNALS--------', response)
189
-
190
- app.requestFlamegraphs({
188
+ app.sendFlamegraphs({
191
189
  serviceIds: [serviceId],
192
190
  workerIds: [workerId],
193
- alertId: response.alertId
191
+ alertId: alert.id
194
192
  }).catch(err => {
195
193
  app.log.error({ err }, 'Failed to send a flamegraph')
196
194
  })