@platformatic/watt-extra 1.6.2 → 1.6.3-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@platformatic/watt-extra",
3
- "version": "1.6.2",
3
+ "version": "1.6.3-alpha.1",
4
4
  "description": "The Platformatic runtime manager",
5
5
  "type": "module",
6
6
  "scripts": {
@@ -19,10 +19,10 @@
19
19
  },
20
20
  "devDependencies": {
21
21
  "@fastify/websocket": "^11.1.0",
22
- "@platformatic/composer": "^3.22.0",
23
- "@platformatic/next": "^3.22.0",
24
- "@platformatic/node": "^3.22.0",
25
- "@platformatic/service": "^3.22.0",
22
+ "@platformatic/composer": "^3.25.0",
23
+ "@platformatic/next": "^3.25.0",
24
+ "@platformatic/node": "^3.25.0",
25
+ "@platformatic/service": "^3.25.0",
26
26
  "atomic-sleep": "^1.0.0",
27
27
  "borp": "^0.21.0",
28
28
  "eslint": "9",
@@ -30,16 +30,16 @@
30
30
  "fastify-plugin": "^5.0.1",
31
31
  "neostandard": "^0.12.0",
32
32
  "next": "^16.0.0",
33
- "platformatic": "^3.22.0",
33
+ "platformatic": "^3.25.0",
34
34
  "pprof-format": "^2.1.0",
35
35
  "why-is-node-running": "^2.3.0"
36
36
  },
37
37
  "dependencies": {
38
38
  "@datadog/pprof": "^5.9.0",
39
39
  "@fastify/error": "^4.2.0",
40
- "@platformatic/foundation": "^3.22.0",
41
- "@platformatic/runtime": "^3.22.0",
42
- "@platformatic/wattpm-pprof-capture": "^3.22.0",
40
+ "@platformatic/foundation": "^3.25.0",
41
+ "@platformatic/runtime": "^3.25.0",
42
+ "@platformatic/wattpm-pprof-capture": "^3.25.0",
43
43
  "avvio": "^9.1.0",
44
44
  "chalk": "^4.1.2",
45
45
  "commist": "^3.2.0",
package/plugins/alerts.js CHANGED
@@ -3,9 +3,9 @@ import { request } from 'undici'
3
3
  async function alerts (app, _opts) {
4
4
  const healthCache = [] // It's OK to have this in memory, this is per-pod.
5
5
  const podHealthWindow =
6
- app.instanceConfig?.config?.scaler?.podHealthWindow || 60 * 1000
6
+ app.instanceConfig?.scaler?.podHealthWindow || 60 * 1000
7
7
  const alertRetentionWindow =
8
- app.instanceConfig?.config?.scaler?.alertRetentionWindow || 10 * 1000
8
+ app.instanceConfig?.scaler?.alertRetentionWindow || 10 * 1000
9
9
 
10
10
  const lastServicesAlertTime = {}
11
11
 
@@ -40,6 +40,7 @@ async function alerts (app, _opts) {
40
40
  }
41
41
 
42
42
  const timestamp = Date.now()
43
+ const workerId = healthInfo.id
43
44
  const serviceId = healthInfo.application
44
45
  const healthWithTimestamp = { ...healthInfo, timestamp, service: serviceId }
45
46
  delete healthWithTimestamp.healthConfig // we don't need to store this
@@ -111,14 +112,12 @@ async function alerts (app, _opts) {
111
112
 
112
113
  const alert = await body.json()
113
114
 
114
- try {
115
- await app.sendFlamegraphs({
116
- serviceIds: [serviceId],
117
- alertId: alert.id
118
- })
119
- } catch (err) {
115
+ app.sendFlamegraphs({
116
+ workerIds: [workerId],
117
+ alertId: alert.id
118
+ }).catch(err => {
120
119
  app.log.error({ err }, 'Failed to send a flamegraph')
121
- }
120
+ })
122
121
  }
123
122
  }
124
123
 
package/plugins/env.js CHANGED
@@ -21,6 +21,7 @@ const schema = {
21
21
  PLT_FLAMEGRAPHS_INTERVAL_SEC: { type: 'number', default: 60 },
22
22
  PLT_FLAMEGRAPHS_ELU_THRESHOLD: { type: 'number', default: 0.4 },
23
23
  PLT_FLAMEGRAPHS_GRACE_PERIOD: { type: 'number', default: 3000 },
24
+ PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: { type: 'number', default: 10000 },
24
25
  PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
25
26
  PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
26
27
  PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },
@@ -8,10 +8,13 @@ async function flamegraphs (app, _opts) {
8
8
  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
9
9
  const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
10
10
  const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
11
+ const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
11
12
 
12
13
  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
13
14
  const eluThreshold = parseFloat(flamegraphsELUThreshold)
14
15
  const gracePeriod = parseInt(flamegraphsGracePeriod)
16
+ const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
17
+ const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
15
18
 
16
19
  let workerStartedListener = null
17
20
 
@@ -125,13 +128,15 @@ async function flamegraphs (app, _opts) {
125
128
  }
126
129
  }
127
130
 
131
+ const profilesByWorkerId = {}
132
+
128
133
  app.sendFlamegraphs = async (options = {}) => {
129
134
  if (isFlamegraphsDisabled) {
130
135
  app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
131
136
  return
132
137
  }
133
138
 
134
- let { serviceIds, alertId, profileType = 'cpu' } = options
139
+ let { workerIds, alertId, profileType = 'cpu' } = options
135
140
 
136
141
  const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
137
142
  if (!scalerUrl) {
@@ -139,61 +144,163 @@ async function flamegraphs (app, _opts) {
139
144
  throw new Error('No scaler URL found in ICC services, cannot send flamegraph')
140
145
  }
141
146
 
142
- const podId = app.instanceId
143
147
  const runtime = app.watt.runtime
144
148
 
145
- if (!serviceIds) {
149
+ if (!workerIds) {
146
150
  const { applications } = await runtime.getApplications()
147
- serviceIds = applications.map(app => app.id)
151
+ workerIds = applications.map(app => app.id)
148
152
  }
149
153
 
150
- const authHeaders = await app.getAuthorizationHeader()
154
+ cleanupFlamegraphsCache()
151
155
 
152
- const uploadPromises = serviceIds.map(async (serviceId) => {
153
- try {
154
- const profile = await runtime.sendCommandToApplication(serviceId, 'getLastProfile', { type: profileType })
155
- if (!profile || !(profile instanceof Uint8Array)) {
156
- app.log.error({ serviceId }, 'Failed to get profile from service')
156
+ const uploadPromises = workerIds.map(async (workerId) => {
157
+ let profile = profilesByWorkerId[workerId]
158
+ if (profile?.flamegraphId) {
159
+ const { flamegraphId } = profile
160
+ try {
161
+ await attachFlamegraphToAlerts(scalerUrl, flamegraphId, [alertId])
157
162
  return
163
+ } catch (err) {
164
+ if (err.code === 'PLT_ATTACH_FLAMEGRAPH_MULTIPLE_ALERTS_NOT_SUPPORTED') {
165
+ app.log.warn(
166
+ 'Attaching flamegraph multiple alerts is not supported by the scaler.' +
167
+ ' Please upgrade to the latest ICC version to use this feature.'
168
+ )
169
+ } else {
170
+ app.log.error({ err, workerId, alertId, flamegraphId }, 'Failed to attach flamegraph to alert')
171
+ }
158
172
  }
173
+ }
159
174
 
160
- const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
175
+ if (!profile) {
176
+ profile = await getServiceFlamegraph(workerId, profileType)
177
+ if (!profile || !(profile.data instanceof Uint8Array)) {
178
+ app.log.error({ workerId }, 'Failed to get profile from service')
179
+ return
180
+ }
181
+ }
161
182
 
162
- app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
183
+ profilesByWorkerId[workerId] = profile
163
184
 
164
- const query = { profileType }
165
- if (alertId) {
166
- query.alertId = alertId
167
- }
185
+ const serviceId = workerId.split(':')[0]
168
186
 
169
- const { statusCode, body } = await request(url, {
170
- method: 'POST',
171
- headers: {
172
- 'Content-Type': 'application/octet-stream',
173
- ...authHeaders
174
- },
175
- query,
176
- body: profile
177
- })
178
-
179
- if (statusCode !== 200) {
180
- const error = await body.text()
181
- app.log.error({ error }, 'Failed to send flamegraph')
182
- throw new Error(`Failed to send flamegraph: ${error}`)
183
- }
187
+ try {
188
+ const flamegraph = await sendServiceFlamegraph(
189
+ scalerUrl,
190
+ serviceId,
191
+ profile.data,
192
+ profileType,
193
+ alertId
194
+ )
195
+ profile.flamegraphId = flamegraph.id
184
196
  } catch (err) {
185
- if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
186
- app.log.info({ serviceId, podId }, 'No profile available for the service')
187
- } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
188
- app.log.info({ serviceId, podId }, 'ELU low, CPU profiling not active')
189
- } else {
190
- app.log.warn({ err, serviceId, podId }, 'Failed to send flamegraph from service')
191
- }
197
+ app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
192
198
  }
193
199
  })
194
200
 
195
201
  await Promise.all(uploadPromises)
196
202
  }
203
+
204
+ async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
205
+ const runtime = app.watt.runtime
206
+
207
+ try {
208
+ const [state, profile] = await Promise.all([
209
+ runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
210
+ runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
211
+ ])
212
+ return { data: profile, timestamp: state.latestProfileTimestamp }
213
+ } catch (err) {
214
+ if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
215
+ app.log.info(
216
+ { workerId, attempt, maxAttempts, attemptTimeout },
217
+ 'No profile available for the service. Waiting for profiling to complete.'
218
+ )
219
+ if (attempt <= maxAttempts) {
220
+ await sleep(attemptTimeout)
221
+ return getServiceFlamegraph(workerId, profileType, attempt + 1)
222
+ }
223
+ } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
224
+ app.log.info({ workerId }, 'ELU low, CPU profiling not active')
225
+ } else {
226
+ app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
227
+
228
+ const [serviceId, workerIndex] = workerId.split(':')
229
+ if (workerIndex) {
230
+ app.log.warn('Worker not available, trying to get profile from another worker')
231
+ return getServiceFlamegraph(serviceId, profileType)
232
+ }
233
+ }
234
+ }
235
+ }
236
+
237
+ async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
238
+ const podId = app.instanceId
239
+ const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
240
+ app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
241
+
242
+ const query = { profileType }
243
+ if (alertId) {
244
+ query.alertId = alertId
245
+ }
246
+
247
+ const authHeaders = await app.getAuthorizationHeader()
248
+ const { statusCode, body } = await request(url, {
249
+ method: 'POST',
250
+ headers: {
251
+ 'Content-Type': 'application/octet-stream',
252
+ ...authHeaders
253
+ },
254
+ query,
255
+ body: profile
256
+ })
257
+
258
+ if (statusCode !== 200) {
259
+ const error = await body.text()
260
+ app.log.error({ error }, 'Failed to send flamegraph')
261
+ throw new Error(`Failed to send flamegraph: ${error}`)
262
+ }
263
+
264
+ const response = await body.json()
265
+ return response
266
+ }
267
+
268
+ async function attachFlamegraphToAlerts (scalerUrl, flamegraphId, alertIds) {
269
+ const url = `${scalerUrl}/flamegraphs/${flamegraphId}/alerts`
270
+ app.log.info({ flamegraphId, alerts: alertIds }, 'Attaching flamegraph to alerts')
271
+
272
+ const authHeaders = await app.getAuthorizationHeader()
273
+ const { statusCode, body } = await request(url, {
274
+ method: 'POST',
275
+ headers: {
276
+ 'Content-Type': 'application/json',
277
+ ...authHeaders
278
+ },
279
+ body: JSON.stringify({ alertIds })
280
+ })
281
+
282
+ if (statusCode !== 200) {
283
+ const error = await body.text()
284
+ if (statusCode === 404 && error.includes('Route POST')) {
285
+ const err = new Error('Attaching flamegraph multiple alerts is not supported by the scaler')
286
+ err.code = 'PLT_ATTACH_FLAMEGRAPH_MULTIPLE_ALERTS_NOT_SUPPORTED'
287
+ throw err
288
+ }
289
+
290
+ throw new Error(`Failed to attach flamegraph to alerts: ${error}`)
291
+ }
292
+ }
293
+
294
+ function cleanupFlamegraphsCache () {
295
+ const now = Date.now()
296
+
297
+ for (const workerId of Object.keys(profilesByWorkerId)) {
298
+ const { timestamp } = profilesByWorkerId[workerId]
299
+ if (now - timestamp > durationMillis) {
300
+ delete profilesByWorkerId[workerId]
301
+ }
302
+ }
303
+ }
197
304
  }
198
305
 
199
306
  export default flamegraphs
@@ -80,6 +80,7 @@ async function healthSignals (app, _opts) {
80
80
  }
81
81
 
82
82
  const {
83
+ id: workerId,
83
84
  application: serviceId,
84
85
  currentHealth,
85
86
  healthSignals
@@ -125,13 +126,13 @@ async function healthSignals (app, _opts) {
125
126
  }
126
127
 
127
128
  if (healthSignals.length > 0) {
128
- await sendHealthSignalsWithTimeout(serviceId, healthSignals)
129
+ await sendHealthSignalsWithTimeout(serviceId, workerId, healthSignals)
129
130
  }
130
131
  })
131
132
  }
132
133
  app.setupHealthSignals = setupHealthSignals
133
134
 
134
- async function sendHealthSignalsWithTimeout (serviceId, signals) {
135
+ async function sendHealthSignalsWithTimeout (serviceId, workerId, signals) {
135
136
  signalsCaches[serviceId] ??= new HealthSignalsCache()
136
137
  servicesSendingStatuses[serviceId] ??= false
137
138
 
@@ -148,7 +149,7 @@ async function healthSignals (app, _opts) {
148
149
 
149
150
  try {
150
151
  const signals = signalsCache.getAll()
151
- await sendHealthSignals(serviceId, signals, metrics)
152
+ await sendHealthSignals(serviceId, workerId, signals, metrics)
152
153
  } catch (err) {
153
154
  app.log.error({ err }, 'Failed to send health signals to scaler')
154
155
  }
@@ -156,7 +157,7 @@ async function healthSignals (app, _opts) {
156
157
  }
157
158
  }
158
159
 
159
- async function sendHealthSignals (serviceId, signals, metrics) {
160
+ async function sendHealthSignals (serviceId, workerId, signals, metrics) {
160
161
  const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
161
162
  const applicationId = app.instanceConfig?.applicationId
162
163
  const authHeaders = await app.getAuthorizationHeader()
@@ -184,11 +185,13 @@ async function healthSignals (app, _opts) {
184
185
 
185
186
  const alert = await body.json()
186
187
 
187
- try {
188
- await app.sendFlamegraphs({ serviceIds: [serviceId], alertId: alert.id })
189
- } catch (err) {
188
+ app.sendFlamegraphs({
189
+ serviceIds: [serviceId],
190
+ workerIds: [workerId],
191
+ alertId: alert.id
192
+ }).catch(err => {
190
193
  app.log.error({ err }, 'Failed to send a flamegraph')
191
- }
194
+ })
192
195
  }
193
196
  }
194
197
 
package/plugins/update.js CHANGED
@@ -23,14 +23,14 @@ async function updatePlugin (app) {
23
23
  // Handle trigger-flamegraph command from ICC
24
24
  if (command === 'trigger-flamegraph') {
25
25
  app.log.info({ command }, 'Received trigger-flamegraph command from ICC')
26
- await app.sendFlamegraphs({ profileType: 'cpu' })
26
+ app.sendFlamegraphs({ profileType: 'cpu' })
27
27
  return
28
28
  }
29
29
 
30
30
  // Handle trigger-heapprofile command from ICC
31
31
  if (command === 'trigger-heapprofile') {
32
32
  app.log.info({ command }, 'Received trigger-heapprofile command from ICC')
33
- await app.sendFlamegraphs({ profileType: 'heap' })
33
+ app.sendFlamegraphs({ profileType: 'heap' })
34
34
  return
35
35
  }
36
36
 
@@ -378,10 +378,8 @@ test('should respect alert retention window', async (t) => {
378
378
  const icc = await startICC(t, {
379
379
  applicationId,
380
380
  applicationName,
381
- iccConfig: {
382
- scaler: {
383
- alertRetentionWindow: 500
384
- }
381
+ scaler: {
382
+ alertRetentionWindow: 500
385
383
  },
386
384
  processAlerts: (req) => {
387
385
  const alert = req.body
@@ -740,3 +738,213 @@ test('should handle old runtime (< 3.18.0) health events', async (t) => {
740
738
  assert.deepStrictEqual(alertReceived.alert.currentHealth, healthInfo.currentHealth)
741
739
  assert.strictEqual(alertReceived.alert.healthConfig, undefined, 'healthConfig should be deleted from alert')
742
740
  })
741
+
742
+ test('should attach one flamegraph to multiple alerts', async (t) => {
743
+ const applicationName = 'test-app'
744
+ const applicationId = randomUUID()
745
+ const applicationPath = join(__dirname, 'fixtures', 'service-1')
746
+
747
+ const receivedAlerts = []
748
+ const receivedFlamegraphs = []
749
+ const receivedAttachedFlamegraphs = []
750
+
751
+ const getAuthorizationHeader = async (headers) => {
752
+ return { ...headers, authorization: 'Bearer test-token' }
753
+ }
754
+
755
+ const icc = await startICC(t, {
756
+ applicationId,
757
+ applicationName,
758
+ scaler: {
759
+ podHealthWindow: 1,
760
+ alertRetentionWindow: 1
761
+ },
762
+ processAlerts: (req) => {
763
+ assert.equal(req.headers.authorization, 'Bearer test-token')
764
+ const alert = req.body
765
+ alert.id = `alert-${receivedAlerts.length + 1}`
766
+ receivedAlerts.push(alert)
767
+ return alert
768
+ },
769
+ processFlamegraphs: (req) => {
770
+ assert.strictEqual(req.headers.authorization, 'Bearer test-token')
771
+ const flamegraphId = `flamegraph-${receivedFlamegraphs.length + 1}`
772
+ const alertId = req.query.alertId
773
+ receivedFlamegraphs.push({ id: flamegraphId, alertId })
774
+ return { id: flamegraphId }
775
+ },
776
+ attachFlamegraphToAlerts: (req) => {
777
+ assert.strictEqual(req.headers.authorization, 'Bearer test-token')
778
+ const flamegraphId = req.params.flamegraphId
779
+ const { alertIds } = req.body
780
+ receivedAttachedFlamegraphs.push({ flamegraphId, alertIds })
781
+ return {}
782
+ }
783
+ })
784
+
785
+ setUpEnvironment({
786
+ PLT_APP_NAME: applicationName,
787
+ PLT_APP_DIR: applicationPath,
788
+ PLT_ICC_URL: 'http://127.0.0.1:3000',
789
+ PLT_DISABLE_FLAMEGRAPHS: false,
790
+ PLT_FLAMEGRAPHS_INTERVAL_SEC: 5,
791
+ PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0
792
+ })
793
+
794
+ const app = await start()
795
+ app.getAuthorizationHeader = getAuthorizationHeader
796
+
797
+ t.after(async () => {
798
+ await app.close()
799
+ await icc.close()
800
+ })
801
+
802
+ // Wait for the first flamegraph to be generated
803
+ await sleep(5000)
804
+
805
+ // Manually trigger health event with unhealthy state
806
+ const healthInfo = {
807
+ id: 'main:0',
808
+ application: 'main',
809
+ currentHealth: {
810
+ elu: 0.995,
811
+ heapUsed: 76798040,
812
+ heapTotal: 99721216
813
+ },
814
+ unhealthy: true,
815
+ healthConfig: {
816
+ enabled: true,
817
+ interval: 1000,
818
+ gracePeriod: 1000,
819
+ maxUnhealthyChecks: 10,
820
+ maxELU: 0.99,
821
+ maxHeapUsed: 0.99,
822
+ maxHeapTotal: 4294967296
823
+ }
824
+ }
825
+
826
+ emitHealthEvent(app, healthInfo)
827
+ await sleep(1000)
828
+ emitHealthEvent(app, healthInfo)
829
+
830
+ // Wait for flamegraphs to be sent
831
+ await sleep(1000)
832
+
833
+ assert.strictEqual(receivedAlerts.length, 2)
834
+ const alert1 = receivedAlerts[0]
835
+ const alert2 = receivedAlerts[1]
836
+ assert.strictEqual(alert1.id, 'alert-1')
837
+ assert.strictEqual(alert2.id, 'alert-2')
838
+
839
+ assert.strictEqual(receivedFlamegraphs.length, 1)
840
+ const flamegraph = receivedFlamegraphs[0]
841
+ assert.strictEqual(flamegraph.id, 'flamegraph-1')
842
+ assert.strictEqual(flamegraph.alertId, 'alert-1')
843
+
844
+ assert.strictEqual(receivedAttachedFlamegraphs.length, 1)
845
+ const attachedFlamegraph = receivedAttachedFlamegraphs[0]
846
+ assert.strictEqual(attachedFlamegraph.flamegraphId, 'flamegraph-1')
847
+ assert.deepStrictEqual(attachedFlamegraph.alertIds, ['alert-2'])
848
+ })
849
+
850
+ test('should send flamegraphs if attaching fails', async (t) => {
851
+ const applicationName = 'test-app'
852
+ const applicationId = randomUUID()
853
+ const applicationPath = join(__dirname, 'fixtures', 'service-1')
854
+
855
+ const receivedAlerts = []
856
+ const receivedFlamegraphs = []
857
+
858
+ const getAuthorizationHeader = async (headers) => {
859
+ return { ...headers, authorization: 'Bearer test-token' }
860
+ }
861
+
862
+ const icc = await startICC(t, {
863
+ applicationId,
864
+ applicationName,
865
+ scaler: {
866
+ podHealthWindow: 1,
867
+ alertRetentionWindow: 1
868
+ },
869
+ processAlerts: (req) => {
870
+ assert.equal(req.headers.authorization, 'Bearer test-token')
871
+ const alert = req.body
872
+ alert.id = `alert-${receivedAlerts.length + 1}`
873
+ receivedAlerts.push(alert)
874
+ return alert
875
+ },
876
+ processFlamegraphs: (req) => {
877
+ assert.strictEqual(req.headers.authorization, 'Bearer test-token')
878
+ const flamegraphId = `flamegraph-${receivedFlamegraphs.length + 1}`
879
+ const alertId = req.query.alertId
880
+ receivedFlamegraphs.push({ id: flamegraphId, alertId })
881
+ return { id: flamegraphId }
882
+ },
883
+ attachFlamegraphToAlerts: (req) => {
884
+ throw new Error('Failed to attach flamegraph')
885
+ }
886
+ })
887
+
888
+ setUpEnvironment({
889
+ PLT_APP_NAME: applicationName,
890
+ PLT_APP_DIR: applicationPath,
891
+ PLT_ICC_URL: 'http://127.0.0.1:3000',
892
+ PLT_DISABLE_FLAMEGRAPHS: false,
893
+ PLT_FLAMEGRAPHS_INTERVAL_SEC: 5,
894
+ PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0
895
+ })
896
+
897
+ const app = await start()
898
+ app.getAuthorizationHeader = getAuthorizationHeader
899
+
900
+ t.after(async () => {
901
+ await app.close()
902
+ await icc.close()
903
+ })
904
+
905
+ // Wait for the first flamegraph to be generated
906
+ await sleep(5000)
907
+
908
+ // Manually trigger health event with unhealthy state
909
+ const healthInfo = {
910
+ id: 'main:0',
911
+ application: 'main',
912
+ currentHealth: {
913
+ elu: 0.995,
914
+ heapUsed: 76798040,
915
+ heapTotal: 99721216
916
+ },
917
+ unhealthy: true,
918
+ healthConfig: {
919
+ enabled: true,
920
+ interval: 1000,
921
+ gracePeriod: 1000,
922
+ maxUnhealthyChecks: 10,
923
+ maxELU: 0.99,
924
+ maxHeapUsed: 0.99,
925
+ maxHeapTotal: 4294967296
926
+ }
927
+ }
928
+
929
+ emitHealthEvent(app, healthInfo)
930
+ await sleep(1000)
931
+ emitHealthEvent(app, healthInfo)
932
+
933
+ // Wait for flamegraphs to be sent
934
+ await sleep(1000)
935
+
936
+ assert.strictEqual(receivedAlerts.length, 2)
937
+ const alert1 = receivedAlerts[0]
938
+ const alert2 = receivedAlerts[1]
939
+ assert.strictEqual(alert1.id, 'alert-1')
940
+ assert.strictEqual(alert2.id, 'alert-2')
941
+
942
+ assert.strictEqual(receivedFlamegraphs.length, 2)
943
+ const flamegraph1 = receivedFlamegraphs[0]
944
+ assert.strictEqual(flamegraph1.id, 'flamegraph-1')
945
+ assert.strictEqual(flamegraph1.alertId, 'alert-1')
946
+
947
+ const flamegraph2 = receivedFlamegraphs[1]
948
+ assert.strictEqual(flamegraph2.id, 'flamegraph-2')
949
+ assert.strictEqual(flamegraph2.alertId, 'alert-2')
950
+ })
package/test/helper.js CHANGED
@@ -199,6 +199,9 @@ async function startICC (t, opts = {}) {
199
199
  icc.post('/pods/:podId/services/:serviceId/flamegraph', async (req) => {
200
200
  return opts.processFlamegraphs?.(req)
201
201
  })
202
+ icc.post('/flamegraphs/:flamegraphId/alerts', async (req) => {
203
+ return opts.attachFlamegraphToAlerts?.(req)
204
+ })
202
205
  }, { prefix: '/scaler' })
203
206
 
204
207
  // Cron
@@ -35,7 +35,7 @@ function setupMockIccServer (wss, receivedMessages, validateAuth = false) {
35
35
  return { waitForClientSubscription, getWs: () => ws }
36
36
  }
37
37
 
38
- function createMockApp (port, includeScalerUrl = true) {
38
+ function createMockApp (port, includeScalerUrl = true, env = {}) {
39
39
  const eventListeners = new Map()
40
40
 
41
41
  const mockWatt = {
@@ -100,7 +100,9 @@ function createMockApp (port, includeScalerUrl = true) {
100
100
  PLT_DISABLE_FLAMEGRAPHS: false,
101
101
  PLT_FLAMEGRAPHS_INTERVAL_SEC: 1,
102
102
  PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0,
103
- PLT_FLAMEGRAPHS_GRACE_PERIOD: 0
103
+ PLT_FLAMEGRAPHS_GRACE_PERIOD: 0,
104
+ PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: 1000,
105
+ ...env
104
106
  },
105
107
  watt: mockWatt
106
108
  }
@@ -333,15 +335,15 @@ test('sendFlamegraphs should handle missing profile data', async (t) => {
333
335
  equal(errors.length, 2, 'Should log errors for both services with missing profiles')
334
336
  })
335
337
 
336
- test('sendFlamegraphs should filter by serviceIds when provided', async (t) => {
338
+ test('sendFlamegraphs should filter by workerIds when provided', async (t) => {
337
339
  setUpEnvironment()
338
340
 
339
341
  const app = createMockApp(port + 12)
340
342
  const getProfileCalls = []
341
343
 
342
- app.watt.runtime.sendCommandToApplication = async (serviceId, command) => {
344
+ app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
343
345
  if (command === 'getLastProfile') {
344
- getProfileCalls.push(serviceId)
346
+ getProfileCalls.push(workerId)
345
347
  return new Uint8Array([1, 2, 3])
346
348
  }
347
349
  return { success: false }
@@ -362,10 +364,49 @@ test('sendFlamegraphs should filter by serviceIds when provided', async (t) => {
362
364
  t.after(() => server.close())
363
365
 
364
366
  await flamegraphsPlugin(app)
365
- await app.sendFlamegraphs({ serviceIds: ['service-1'] })
367
+ await app.sendFlamegraphs({ workerIds: ['service-1:0'] })
366
368
 
367
369
  equal(getProfileCalls.length, 1, 'Should only request profile for specified service')
368
- equal(getProfileCalls[0], 'service-1', 'Should request profile for service-1')
370
+ equal(getProfileCalls[0], 'service-1:0', 'Should request profile for service-1')
371
+ })
372
+
373
+ test('sendFlamegraphs should try to get the profile from a service if worker is not available', async (t) => {
374
+ setUpEnvironment()
375
+
376
+ const app = createMockApp(port + 12)
377
+ const getProfileCalls = []
378
+
379
+ app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
380
+ if (command === 'getLastProfile') {
381
+ getProfileCalls.push(workerId)
382
+ if (workerId === 'service-1:2') {
383
+ throw new Error('Worker not available')
384
+ }
385
+ return new Uint8Array([1, 2, 3])
386
+ }
387
+ return { success: false }
388
+ }
389
+
390
+ // Mock HTTP server
391
+ const { createServer } = await import('node:http')
392
+ const server = createServer((req, res) => {
393
+ const body = []
394
+ req.on('data', chunk => body.push(chunk))
395
+ req.on('end', () => {
396
+ res.writeHead(200)
397
+ res.end()
398
+ })
399
+ })
400
+
401
+ await new Promise(resolve => server.listen(port + 12, resolve))
402
+ t.after(() => server.close())
403
+
404
+ await flamegraphsPlugin(app)
405
+ await app.sendFlamegraphs({ workerIds: ['service-1:2'] })
406
+
407
+ equal(getProfileCalls.length, 2)
408
+ equal(getProfileCalls[0], 'service-1:2')
409
+ equal(getProfileCalls[1], 'service-1')
369
410
  })
370
411
 
371
412
  test('sendFlamegraphs should skip when PLT_DISABLE_FLAMEGRAPHS is set', async (t) => {
@@ -376,9 +417,9 @@ test('sendFlamegraphs should skip when PLT_DISABLE_FLAMEGRAPHS is set', async (t
376
417
 
377
418
  const getProfileCalls = []
378
419
 
379
- app.watt.runtime.sendCommandToApplication = async (serviceId, command) => {
420
+ app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
380
421
  if (command === 'getLastProfile') {
381
- getProfileCalls.push(serviceId)
422
+ getProfileCalls.push(workerId)
382
423
  return new Uint8Array([1, 2, 3])
383
424
  }
384
425
  return { success: false }
@@ -452,6 +493,13 @@ test('should handle trigger-flamegraph command and upload flamegraphs from servi
452
493
  await app.connectToUpdates()
453
494
  await app.setupFlamegraphs()
454
495
 
496
+ t.after(async () => {
497
+ if (app.cleanupFlamegraphs) {
498
+ app.cleanupFlamegraphs()
499
+ }
500
+ await app.closeUpdates()
501
+ })
502
+
455
503
  await waitForClientSubscription
456
504
 
457
505
  const triggerFlamegraphMessage = {
@@ -473,9 +521,6 @@ test('should handle trigger-flamegraph command and upload flamegraphs from servi
473
521
 
474
522
  equal(service1Req.serviceId, 'service-1')
475
523
  equal(service2Req.serviceId, 'service-2')
476
-
477
- if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
478
- await app.closeUpdates()
479
524
  })
480
525
 
481
526
  test('should handle trigger-flamegraph when no runtime is available', async (t) => {
@@ -609,6 +654,13 @@ test('should handle trigger-heapprofile command and upload heap profiles from se
609
654
  await app.connectToUpdates()
610
655
  await app.setupFlamegraphs()
611
656
 
657
+ t.after(async () => {
658
+ if (app.cleanupFlamegraphs) {
659
+ app.cleanupFlamegraphs()
660
+ }
661
+ await app.closeUpdates()
662
+ })
663
+
612
664
  await waitForClientSubscription
613
665
 
614
666
  const triggerHeapProfileMessage = {
@@ -630,9 +682,6 @@ test('should handle trigger-heapprofile command and upload heap profiles from se
630
682
 
631
683
  equal(service1Req.serviceId, 'service-1')
632
684
  equal(service2Req.serviceId, 'service-2')
633
-
634
- if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
635
- await app.closeUpdates()
636
685
  })
637
686
 
638
687
  test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (t) => {
@@ -640,11 +689,6 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
640
689
 
641
690
  const receivedMessages = []
642
691
  const infoLogs = []
643
- let errorCount = 0
644
- let uploadResolve
645
- const allUploadsComplete = new Promise((resolve) => {
646
- uploadResolve = resolve
647
- })
648
692
 
649
693
  const wss = new WebSocketServer({ port: port + 4 })
650
694
  t.after(async () => wss.close())
@@ -655,19 +699,21 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
655
699
  true
656
700
  )
657
701
 
658
- const app = createMockApp(port + 4)
702
+ const app = createMockApp(port + 4, true, {
703
+ PLT_FLAMEGRAPHS_INTERVAL_SEC: 10,
704
+ PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: 1000
705
+ })
706
+
659
707
  const originalInfo = app.log.info
660
708
  app.log.info = (...args) => {
661
709
  originalInfo(...args)
662
- if (args[1] && args[1].includes('No profile available for the service')) {
663
- infoLogs.push(args)
664
- errorCount++
665
- if (errorCount === 2) {
666
- uploadResolve()
667
- }
668
- }
710
+ infoLogs.push(args)
669
711
  }
670
712
 
713
+ // Profile will be generated in 10s
714
+ const profileGenerationDate = Date.now() + 10000
715
+ const mockProfile = new Uint8Array([1, 2, 3, 4, 5])
716
+
671
717
  app.watt.runtime.sendCommandToApplication = async (
672
718
  serviceId,
673
719
  command
@@ -676,9 +722,13 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
676
722
  return { success: true }
677
723
  }
678
724
  if (command === 'getLastProfile') {
679
- const error = new Error('No profile available - wait for profiling to complete or trigger manual capture')
680
- error.code = 'PLT_PPROF_NO_PROFILE_AVAILABLE'
681
- throw error
725
+ const now = Date.now()
726
+ if (now < profileGenerationDate) {
727
+ const error = new Error('No profile available - wait for profiling to complete or trigger manual capture')
728
+ error.code = 'PLT_PPROF_NO_PROFILE_AVAILABLE'
729
+ throw error
730
+ }
731
+ return mockProfile
682
732
  }
683
733
  return { success: false }
684
734
  }
@@ -689,6 +739,13 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
689
739
  await app.connectToUpdates()
690
740
  await app.setupFlamegraphs()
691
741
 
742
+ t.after(async () => {
743
+ if (app.cleanupFlamegraphs) {
744
+ app.cleanupFlamegraphs()
745
+ }
746
+ await app.closeUpdates()
747
+ })
748
+
692
749
  await waitForClientSubscription
693
750
 
694
751
  const triggerFlamegraphMessage = {
@@ -697,15 +754,47 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
697
754
 
698
755
  getWs().send(JSON.stringify(triggerFlamegraphMessage))
699
756
 
700
- await allUploadsComplete
757
+ await sleep(15000)
701
758
 
702
- equal(infoLogs.length, 2)
703
- equal(infoLogs[0][0].serviceId, 'service-1')
704
- equal(infoLogs[0][0].podId, 'test-pod-123')
705
- equal(infoLogs[0][1], 'No profile available for the service')
759
+ const service1AttemptLogs = []
760
+ const service2AttemptLogs = []
761
+ const service1SuccessLogs = []
762
+ const service2SuccessLogs = []
706
763
 
707
- if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
708
- await app.closeUpdates()
764
+ for (const infoLog of infoLogs) {
765
+ if (infoLog.length !== 2) continue
766
+ const [options, message] = infoLog
767
+
768
+ if (message.includes('No profile available for the service')) {
769
+ const { workerId, attempt, maxAttempts, attemptTimeout } = options
770
+
771
+ equal(maxAttempts, 11)
772
+ equal(attemptTimeout, 1000)
773
+
774
+ if (workerId === 'service-1') {
775
+ service1AttemptLogs.push(infoLog)
776
+ equal(attempt, service1AttemptLogs.length)
777
+ }
778
+ if (workerId === 'service-2') {
779
+ service2AttemptLogs.push(infoLog)
780
+ equal(attempt, service2AttemptLogs.length)
781
+ }
782
+ continue
783
+ }
784
+
785
+ if (message.includes('Sending flamegraph')) {
786
+ if (options.serviceId === 'service-1') {
787
+ service1SuccessLogs.push(infoLog)
788
+ } else if (options.serviceId === 'service-2') {
789
+ service2SuccessLogs.push(infoLog)
790
+ }
791
+ }
792
+ }
793
+
794
+ equal(service1AttemptLogs.length, 10)
795
+ equal(service2AttemptLogs.length, 10)
796
+ equal(service1SuccessLogs.length, 1)
797
+ equal(service2SuccessLogs.length, 1)
709
798
  })
710
799
 
711
800
  test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) => {
@@ -762,6 +851,13 @@ test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) =>
762
851
  await app.connectToUpdates()
763
852
  await app.setupFlamegraphs()
764
853
 
854
+ t.after(async () => {
855
+ if (app.cleanupFlamegraphs) {
856
+ app.cleanupFlamegraphs()
857
+ }
858
+ await app.closeUpdates()
859
+ })
860
+
765
861
  await waitForClientSubscription
766
862
 
767
863
  const triggerFlamegraphMessage = {
@@ -773,12 +869,8 @@ test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) =>
773
869
  await allUploadsComplete
774
870
 
775
871
  equal(infoLogs.length, 2)
776
- equal(infoLogs[0][0].serviceId, 'service-1')
777
- equal(infoLogs[0][0].podId, 'test-pod-123')
872
+ equal(infoLogs[0][0].workerId, 'service-1')
778
873
  equal(infoLogs[0][1], 'ELU low, CPU profiling not active')
779
-
780
- if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
781
- await app.closeUpdates()
782
874
  })
783
875
 
784
876
  test('should start profiling on new workers that start after initial setup', async (t) => {
@@ -815,6 +907,13 @@ test('should start profiling on new workers that start after initial setup', asy
815
907
  await app.connectToUpdates()
816
908
  await app.setupFlamegraphs()
817
909
 
910
+ t.after(async () => {
911
+ if (app.cleanupFlamegraphs) {
912
+ app.cleanupFlamegraphs()
913
+ }
914
+ await app.closeUpdates()
915
+ })
916
+
818
917
  await waitForClientSubscription
819
918
 
820
919
  equal(startProfilingCalls.length, 4)
@@ -844,9 +943,6 @@ test('should start profiling on new workers that start after initial setup', asy
844
943
  equal(startProfilingCalls[5].options.durationMillis, 1000)
845
944
  equal(startProfilingCalls[5].options.eluThreshold, 0)
846
945
  equal(startProfilingCalls[5].options.type, 'heap')
847
-
848
- if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
849
- await app.closeUpdates()
850
946
  })
851
947
 
852
948
  test('should not start profiling on new workers when flamegraphs are disabled', async (t) => {
@@ -884,6 +980,13 @@ test('should not start profiling on new workers when flamegraphs are disabled',
884
980
  await app.connectToUpdates()
885
981
  await app.setupFlamegraphs()
886
982
 
983
+ t.after(async () => {
984
+ if (app.cleanupFlamegraphs) {
985
+ app.cleanupFlamegraphs()
986
+ }
987
+ await app.closeUpdates()
988
+ })
989
+
887
990
  await waitForClientSubscription
888
991
 
889
992
  equal(startProfilingCalls.length, 0)
@@ -897,9 +1000,6 @@ test('should not start profiling on new workers when flamegraphs are disabled',
897
1000
  await sleep(10)
898
1001
 
899
1002
  equal(startProfilingCalls.length, 0)
900
-
901
- if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
902
- await app.closeUpdates()
903
1003
  })
904
1004
 
905
1005
  test('sendFlamegraphs should include alertId in query params when provided', async (t) => {