npm - @platformatic/watt-extra - Versions diffs - 1.6.2 → 1.6.3-alpha.1 - Mend

@platformatic/watt-extra 1.6.2 → 1.6.3-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/package.json +9 -9
package/plugins/alerts.js +8 -9
package/plugins/env.js +1 -0
package/plugins/flamegraphs.js +145 -38
package/plugins/health-signals.js +11 -8
package/plugins/update.js +2 -2
package/test/alerts.test.js +212 -4
package/test/helper.js +3 -0
package/test/trigger-flamegraphs.test.js +149 -49

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@platformatic/watt-extra",
-  "version": "1.6.2",
+  "version": "1.6.3-alpha.1",
   "description": "The Platformatic runtime manager",
   "type": "module",
   "scripts": {
@@ -19,10 +19,10 @@
   },
   "devDependencies": {
     "@fastify/websocket": "^11.1.0",
-    "@platformatic/composer": "^3.22.0",
-    "@platformatic/next": "^3.22.0",
-    "@platformatic/node": "^3.22.0",
-    "@platformatic/service": "^3.22.0",
+    "@platformatic/composer": "^3.25.0",
+    "@platformatic/next": "^3.25.0",
+    "@platformatic/node": "^3.25.0",
+    "@platformatic/service": "^3.25.0",
     "atomic-sleep": "^1.0.0",
     "borp": "^0.21.0",
     "eslint": "9",
@@ -30,16 +30,16 @@
     "fastify-plugin": "^5.0.1",
     "neostandard": "^0.12.0",
     "next": "^16.0.0",
-    "platformatic": "^3.22.0",
+    "platformatic": "^3.25.0",
     "pprof-format": "^2.1.0",
     "why-is-node-running": "^2.3.0"
   },
   "dependencies": {
     "@datadog/pprof": "^5.9.0",
     "@fastify/error": "^4.2.0",
-    "@platformatic/foundation": "^3.22.0",
-    "@platformatic/runtime": "^3.22.0",
-    "@platformatic/wattpm-pprof-capture": "^3.22.0",
+    "@platformatic/foundation": "^3.25.0",
+    "@platformatic/runtime": "^3.25.0",
+    "@platformatic/wattpm-pprof-capture": "^3.25.0",
     "avvio": "^9.1.0",
     "chalk": "^4.1.2",
     "commist": "^3.2.0",

package/plugins/alerts.js CHANGED Viewed

@@ -3,9 +3,9 @@ import { request } from 'undici'
 async function alerts (app, _opts) {
   const healthCache = [] // It's OK to have this in memory, this is per-pod.
   const podHealthWindow =
-    app.instanceConfig?.config?.scaler?.podHealthWindow || 60 * 1000
+    app.instanceConfig?.scaler?.podHealthWindow || 60 * 1000
   const alertRetentionWindow =
-    app.instanceConfig?.config?.scaler?.alertRetentionWindow || 10 * 1000
+    app.instanceConfig?.scaler?.alertRetentionWindow || 10 * 1000
   const lastServicesAlertTime = {}
@@ -40,6 +40,7 @@ async function alerts (app, _opts) {
       }
       const timestamp = Date.now()
+      const workerId = healthInfo.id
       const serviceId = healthInfo.application
       const healthWithTimestamp = { ...healthInfo, timestamp, service: serviceId }
       delete healthWithTimestamp.healthConfig // we don't need to store this
@@ -111,14 +112,12 @@ async function alerts (app, _opts) {
         const alert = await body.json()
-        try {
-          await app.sendFlamegraphs({
-            serviceIds: [serviceId],
-            alertId: alert.id
-          })
-        } catch (err) {
+        app.sendFlamegraphs({
+          workerIds: [workerId],
+          alertId: alert.id
+        }).catch(err => {
           app.log.error({ err }, 'Failed to send a flamegraph')
-        }
+        })
       }
     }

package/plugins/env.js CHANGED Viewed

@@ -21,6 +21,7 @@ const schema = {
     PLT_FLAMEGRAPHS_INTERVAL_SEC: { type: 'number', default: 60 },
     PLT_FLAMEGRAPHS_ELU_THRESHOLD: { type: 'number', default: 0.4 },
     PLT_FLAMEGRAPHS_GRACE_PERIOD: { type: 'number', default: 3000 },
+    PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: { type: 'number', default: 10000 },
     PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
     PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
     PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },

package/plugins/flamegraphs.js CHANGED Viewed

@@ -8,10 +8,13 @@ async function flamegraphs (app, _opts) {
   const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
   const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
   const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
+  const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
   const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
   const eluThreshold = parseFloat(flamegraphsELUThreshold)
   const gracePeriod = parseInt(flamegraphsGracePeriod)
+  const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
+  const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
   let workerStartedListener = null
@@ -125,13 +128,15 @@ async function flamegraphs (app, _opts) {
     }
   }
+  const profilesByWorkerId = {}
   app.sendFlamegraphs = async (options = {}) => {
     if (isFlamegraphsDisabled) {
       app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
       return
     }
-    let { serviceIds, alertId, profileType = 'cpu' } = options
+    let { workerIds, alertId, profileType = 'cpu' } = options
     const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
     if (!scalerUrl) {
@@ -139,61 +144,163 @@ async function flamegraphs (app, _opts) {
       throw new Error('No scaler URL found in ICC services, cannot send flamegraph')
     }
-    const podId = app.instanceId
     const runtime = app.watt.runtime
-    if (!serviceIds) {
+    if (!workerIds) {
       const { applications } = await runtime.getApplications()
-      serviceIds = applications.map(app => app.id)
+      workerIds = applications.map(app => app.id)
     }
-    const authHeaders = await app.getAuthorizationHeader()
+    cleanupFlamegraphsCache()
-    const uploadPromises = serviceIds.map(async (serviceId) => {
-      try {
-        const profile = await runtime.sendCommandToApplication(serviceId, 'getLastProfile', { type: profileType })
-        if (!profile || !(profile instanceof Uint8Array)) {
-          app.log.error({ serviceId }, 'Failed to get profile from service')
+    const uploadPromises = workerIds.map(async (workerId) => {
+      let profile = profilesByWorkerId[workerId]
+      if (profile?.flamegraphId) {
+        const { flamegraphId } = profile
+        try {
+          await attachFlamegraphToAlerts(scalerUrl, flamegraphId, [alertId])
           return
+        } catch (err) {
+          if (err.code === 'PLT_ATTACH_FLAMEGRAPH_MULTIPLE_ALERTS_NOT_SUPPORTED') {
+            app.log.warn(
+              'Attaching flamegraph multiple alerts is not supported by the scaler.' +
+                ' Please upgrade to the latest ICC version to use this feature.'
+            )
+          } else {
+            app.log.error({ err, workerId, alertId, flamegraphId }, 'Failed to attach flamegraph to alert')
+          }
         }
+      }
-        const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
+      if (!profile) {
+        profile = await getServiceFlamegraph(workerId, profileType)
+        if (!profile || !(profile.data instanceof Uint8Array)) {
+          app.log.error({ workerId }, 'Failed to get profile from service')
+          return
+        }
+      }
-        app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
+      profilesByWorkerId[workerId] = profile
-        const query = { profileType }
-        if (alertId) {
-          query.alertId = alertId
-        }
+      const serviceId = workerId.split(':')[0]
-        const { statusCode, body } = await request(url, {
-          method: 'POST',
-          headers: {
-            'Content-Type': 'application/octet-stream',
-            ...authHeaders
-          },
-          query,
-          body: profile
-        })
-        if (statusCode !== 200) {
-          const error = await body.text()
-          app.log.error({ error }, 'Failed to send flamegraph')
-          throw new Error(`Failed to send flamegraph: ${error}`)
-        }
+      try {
+        const flamegraph = await sendServiceFlamegraph(
+          scalerUrl,
+          serviceId,
+          profile.data,
+          profileType,
+          alertId
+        )
+        profile.flamegraphId = flamegraph.id
       } catch (err) {
-        if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
-          app.log.info({ serviceId, podId }, 'No profile available for the service')
-        } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
-          app.log.info({ serviceId, podId }, 'ELU low, CPU profiling not active')
-        } else {
-          app.log.warn({ err, serviceId, podId }, 'Failed to send flamegraph from service')
-        }
+        app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
       }
     })
     await Promise.all(uploadPromises)
   }
+  async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
+    const runtime = app.watt.runtime
+    try {
+      const [state, profile] = await Promise.all([
+        runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
+        runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
+      ])
+      return { data: profile, timestamp: state.latestProfileTimestamp }
+    } catch (err) {
+      if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
+        app.log.info(
+          { workerId, attempt, maxAttempts, attemptTimeout },
+          'No profile available for the service. Waiting for profiling to complete.'
+        )
+        if (attempt <= maxAttempts) {
+          await sleep(attemptTimeout)
+          return getServiceFlamegraph(workerId, profileType, attempt + 1)
+        }
+      } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
+        app.log.info({ workerId }, 'ELU low, CPU profiling not active')
+      } else {
+        app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
+        const [serviceId, workerIndex] = workerId.split(':')
+        if (workerIndex) {
+          app.log.warn('Worker not available, trying to get profile from another worker')
+          return getServiceFlamegraph(serviceId, profileType)
+        }
+      }
+    }
+  }
+  async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
+    const podId = app.instanceId
+    const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
+    app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
+    const query = { profileType }
+    if (alertId) {
+      query.alertId = alertId
+    }
+    const authHeaders = await app.getAuthorizationHeader()
+    const { statusCode, body } = await request(url, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/octet-stream',
+        ...authHeaders
+      },
+      query,
+      body: profile
+    })
+    if (statusCode !== 200) {
+      const error = await body.text()
+      app.log.error({ error }, 'Failed to send flamegraph')
+      throw new Error(`Failed to send flamegraph: ${error}`)
+    }
+    const response = await body.json()
+    return response
+  }
+  async function attachFlamegraphToAlerts (scalerUrl, flamegraphId, alertIds) {
+    const url = `${scalerUrl}/flamegraphs/${flamegraphId}/alerts`
+    app.log.info({ flamegraphId, alerts: alertIds }, 'Attaching flamegraph to alerts')
+    const authHeaders = await app.getAuthorizationHeader()
+    const { statusCode, body } = await request(url, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        ...authHeaders
+      },
+      body: JSON.stringify({ alertIds })
+    })
+    if (statusCode !== 200) {
+      const error = await body.text()
+      if (statusCode === 404 && error.includes('Route POST')) {
+        const err = new Error('Attaching flamegraph multiple alerts is not supported by the scaler')
+        err.code = 'PLT_ATTACH_FLAMEGRAPH_MULTIPLE_ALERTS_NOT_SUPPORTED'
+        throw err
+      }
+      throw new Error(`Failed to attach flamegraph to alerts: ${error}`)
+    }
+  }
+  function cleanupFlamegraphsCache () {
+    const now = Date.now()
+    for (const workerId of Object.keys(profilesByWorkerId)) {
+      const { timestamp } = profilesByWorkerId[workerId]
+      if (now - timestamp > durationMillis) {
+        delete profilesByWorkerId[workerId]
+      }
+    }
+  }
 }
 export default flamegraphs

package/plugins/health-signals.js CHANGED Viewed

@@ -80,6 +80,7 @@ async function healthSignals (app, _opts) {
       }
       const {
+        id: workerId,
         application: serviceId,
         currentHealth,
         healthSignals
@@ -125,13 +126,13 @@ async function healthSignals (app, _opts) {
       }
       if (healthSignals.length > 0) {
-        await sendHealthSignalsWithTimeout(serviceId, healthSignals)
+        await sendHealthSignalsWithTimeout(serviceId, workerId, healthSignals)
       }
     })
   }
   app.setupHealthSignals = setupHealthSignals
-  async function sendHealthSignalsWithTimeout (serviceId, signals) {
+  async function sendHealthSignalsWithTimeout (serviceId, workerId, signals) {
     signalsCaches[serviceId] ??= new HealthSignalsCache()
     servicesSendingStatuses[serviceId] ??= false
@@ -148,7 +149,7 @@ async function healthSignals (app, _opts) {
         try {
           const signals = signalsCache.getAll()
-          await sendHealthSignals(serviceId, signals, metrics)
+          await sendHealthSignals(serviceId, workerId, signals, metrics)
         } catch (err) {
           app.log.error({ err }, 'Failed to send health signals to scaler')
         }
@@ -156,7 +157,7 @@ async function healthSignals (app, _opts) {
     }
   }
-  async function sendHealthSignals (serviceId, signals, metrics) {
+  async function sendHealthSignals (serviceId, workerId, signals, metrics) {
     const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
     const applicationId = app.instanceConfig?.applicationId
     const authHeaders = await app.getAuthorizationHeader()
@@ -184,11 +185,13 @@ async function healthSignals (app, _opts) {
     const alert = await body.json()
-    try {
-      await app.sendFlamegraphs({ serviceIds: [serviceId], alertId: alert.id })
-    } catch (err) {
+    app.sendFlamegraphs({
+      serviceIds: [serviceId],
+      workerIds: [workerId],
+      alertId: alert.id
+    }).catch(err => {
       app.log.error({ err }, 'Failed to send a flamegraph')
-    }
+    })
   }
 }

package/plugins/update.js CHANGED Viewed

@@ -23,14 +23,14 @@ async function updatePlugin (app) {
       // Handle trigger-flamegraph command from ICC
       if (command === 'trigger-flamegraph') {
         app.log.info({ command }, 'Received trigger-flamegraph command from ICC')
-        await app.sendFlamegraphs({ profileType: 'cpu' })
+        app.sendFlamegraphs({ profileType: 'cpu' })
         return
       }
       // Handle trigger-heapprofile command from ICC
       if (command === 'trigger-heapprofile') {
         app.log.info({ command }, 'Received trigger-heapprofile command from ICC')
-        await app.sendFlamegraphs({ profileType: 'heap' })
+        app.sendFlamegraphs({ profileType: 'heap' })
         return
       }

package/test/alerts.test.js CHANGED Viewed

@@ -378,10 +378,8 @@ test('should respect alert retention window', async (t) => {
   const icc = await startICC(t, {
     applicationId,
     applicationName,
-    iccConfig: {
-      scaler: {
-        alertRetentionWindow: 500
-      }
+    scaler: {
+      alertRetentionWindow: 500
     },
     processAlerts: (req) => {
       const alert = req.body
@@ -740,3 +738,213 @@ test('should handle old runtime (< 3.18.0) health events', async (t) => {
   assert.deepStrictEqual(alertReceived.alert.currentHealth, healthInfo.currentHealth)
   assert.strictEqual(alertReceived.alert.healthConfig, undefined, 'healthConfig should be deleted from alert')
 })
+test('should attach one flamegraph to multiple alerts', async (t) => {
+  const applicationName = 'test-app'
+  const applicationId = randomUUID()
+  const applicationPath = join(__dirname, 'fixtures', 'service-1')
+  const receivedAlerts = []
+  const receivedFlamegraphs = []
+  const receivedAttachedFlamegraphs = []
+  const getAuthorizationHeader = async (headers) => {
+    return { ...headers, authorization: 'Bearer test-token' }
+  }
+  const icc = await startICC(t, {
+    applicationId,
+    applicationName,
+    scaler: {
+      podHealthWindow: 1,
+      alertRetentionWindow: 1
+    },
+    processAlerts: (req) => {
+      assert.equal(req.headers.authorization, 'Bearer test-token')
+      const alert = req.body
+      alert.id = `alert-${receivedAlerts.length + 1}`
+      receivedAlerts.push(alert)
+      return alert
+    },
+    processFlamegraphs: (req) => {
+      assert.strictEqual(req.headers.authorization, 'Bearer test-token')
+      const flamegraphId = `flamegraph-${receivedFlamegraphs.length + 1}`
+      const alertId = req.query.alertId
+      receivedFlamegraphs.push({ id: flamegraphId, alertId })
+      return { id: flamegraphId }
+    },
+    attachFlamegraphToAlerts: (req) => {
+      assert.strictEqual(req.headers.authorization, 'Bearer test-token')
+      const flamegraphId = req.params.flamegraphId
+      const { alertIds } = req.body
+      receivedAttachedFlamegraphs.push({ flamegraphId, alertIds })
+      return {}
+    }
+  })
+  setUpEnvironment({
+    PLT_APP_NAME: applicationName,
+    PLT_APP_DIR: applicationPath,
+    PLT_ICC_URL: 'http://127.0.0.1:3000',
+    PLT_DISABLE_FLAMEGRAPHS: false,
+    PLT_FLAMEGRAPHS_INTERVAL_SEC: 5,
+    PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0
+  })
+  const app = await start()
+  app.getAuthorizationHeader = getAuthorizationHeader
+  t.after(async () => {
+    await app.close()
+    await icc.close()
+  })
+  // Wait for the first flamegraph to be generated
+  await sleep(5000)
+  // Manually trigger health event with unhealthy state
+  const healthInfo = {
+    id: 'main:0',
+    application: 'main',
+    currentHealth: {
+      elu: 0.995,
+      heapUsed: 76798040,
+      heapTotal: 99721216
+    },
+    unhealthy: true,
+    healthConfig: {
+      enabled: true,
+      interval: 1000,
+      gracePeriod: 1000,
+      maxUnhealthyChecks: 10,
+      maxELU: 0.99,
+      maxHeapUsed: 0.99,
+      maxHeapTotal: 4294967296
+    }
+  }
+  emitHealthEvent(app, healthInfo)
+  await sleep(1000)
+  emitHealthEvent(app, healthInfo)
+  // Wait for flamegraphs to be sent
+  await sleep(1000)
+  assert.strictEqual(receivedAlerts.length, 2)
+  const alert1 = receivedAlerts[0]
+  const alert2 = receivedAlerts[1]
+  assert.strictEqual(alert1.id, 'alert-1')
+  assert.strictEqual(alert2.id, 'alert-2')
+  assert.strictEqual(receivedFlamegraphs.length, 1)
+  const flamegraph = receivedFlamegraphs[0]
+  assert.strictEqual(flamegraph.id, 'flamegraph-1')
+  assert.strictEqual(flamegraph.alertId, 'alert-1')
+  assert.strictEqual(receivedAttachedFlamegraphs.length, 1)
+  const attachedFlamegraph = receivedAttachedFlamegraphs[0]
+  assert.strictEqual(attachedFlamegraph.flamegraphId, 'flamegraph-1')
+  assert.deepStrictEqual(attachedFlamegraph.alertIds, ['alert-2'])
+})
+test('should send flamegraphs if attaching fails', async (t) => {
+  const applicationName = 'test-app'
+  const applicationId = randomUUID()
+  const applicationPath = join(__dirname, 'fixtures', 'service-1')
+  const receivedAlerts = []
+  const receivedFlamegraphs = []
+  const getAuthorizationHeader = async (headers) => {
+    return { ...headers, authorization: 'Bearer test-token' }
+  }
+  const icc = await startICC(t, {
+    applicationId,
+    applicationName,
+    scaler: {
+      podHealthWindow: 1,
+      alertRetentionWindow: 1
+    },
+    processAlerts: (req) => {
+      assert.equal(req.headers.authorization, 'Bearer test-token')
+      const alert = req.body
+      alert.id = `alert-${receivedAlerts.length + 1}`
+      receivedAlerts.push(alert)
+      return alert
+    },
+    processFlamegraphs: (req) => {
+      assert.strictEqual(req.headers.authorization, 'Bearer test-token')
+      const flamegraphId = `flamegraph-${receivedFlamegraphs.length + 1}`
+      const alertId = req.query.alertId
+      receivedFlamegraphs.push({ id: flamegraphId, alertId })
+      return { id: flamegraphId }
+    },
+    attachFlamegraphToAlerts: (req) => {
+      throw new Error('Failed to attach flamegraph')
+    }
+  })
+  setUpEnvironment({
+    PLT_APP_NAME: applicationName,
+    PLT_APP_DIR: applicationPath,
+    PLT_ICC_URL: 'http://127.0.0.1:3000',
+    PLT_DISABLE_FLAMEGRAPHS: false,
+    PLT_FLAMEGRAPHS_INTERVAL_SEC: 5,
+    PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0
+  })
+  const app = await start()
+  app.getAuthorizationHeader = getAuthorizationHeader
+  t.after(async () => {
+    await app.close()
+    await icc.close()
+  })
+  // Wait for the first flamegraph to be generated
+  await sleep(5000)
+  // Manually trigger health event with unhealthy state
+  const healthInfo = {
+    id: 'main:0',
+    application: 'main',
+    currentHealth: {
+      elu: 0.995,
+      heapUsed: 76798040,
+      heapTotal: 99721216
+    },
+    unhealthy: true,
+    healthConfig: {
+      enabled: true,
+      interval: 1000,
+      gracePeriod: 1000,
+      maxUnhealthyChecks: 10,
+      maxELU: 0.99,
+      maxHeapUsed: 0.99,
+      maxHeapTotal: 4294967296
+    }
+  }
+  emitHealthEvent(app, healthInfo)
+  await sleep(1000)
+  emitHealthEvent(app, healthInfo)
+  // Wait for flamegraphs to be sent
+  await sleep(1000)
+  assert.strictEqual(receivedAlerts.length, 2)
+  const alert1 = receivedAlerts[0]
+  const alert2 = receivedAlerts[1]
+  assert.strictEqual(alert1.id, 'alert-1')
+  assert.strictEqual(alert2.id, 'alert-2')
+  assert.strictEqual(receivedFlamegraphs.length, 2)
+  const flamegraph1 = receivedFlamegraphs[0]
+  assert.strictEqual(flamegraph1.id, 'flamegraph-1')
+  assert.strictEqual(flamegraph1.alertId, 'alert-1')
+  const flamegraph2 = receivedFlamegraphs[1]
+  assert.strictEqual(flamegraph2.id, 'flamegraph-2')
+  assert.strictEqual(flamegraph2.alertId, 'alert-2')
+})

package/test/helper.js CHANGED Viewed

@@ -199,6 +199,9 @@ async function startICC (t, opts = {}) {
     icc.post('/pods/:podId/services/:serviceId/flamegraph', async (req) => {
       return opts.processFlamegraphs?.(req)
     })
+    icc.post('/flamegraphs/:flamegraphId/alerts', async (req) => {
+      return opts.attachFlamegraphToAlerts?.(req)
+    })
   }, { prefix: '/scaler' })
   // Cron

package/test/trigger-flamegraphs.test.js CHANGED Viewed

@@ -35,7 +35,7 @@ function setupMockIccServer (wss, receivedMessages, validateAuth = false) {
   return { waitForClientSubscription, getWs: () => ws }
 }
-function createMockApp (port, includeScalerUrl = true) {
+function createMockApp (port, includeScalerUrl = true, env = {}) {
   const eventListeners = new Map()
   const mockWatt = {
@@ -100,7 +100,9 @@ function createMockApp (port, includeScalerUrl = true) {
       PLT_DISABLE_FLAMEGRAPHS: false,
       PLT_FLAMEGRAPHS_INTERVAL_SEC: 1,
       PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0,
-      PLT_FLAMEGRAPHS_GRACE_PERIOD: 0
+      PLT_FLAMEGRAPHS_GRACE_PERIOD: 0,
+      PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: 1000,
+      ...env
     },
     watt: mockWatt
   }
@@ -333,15 +335,15 @@ test('sendFlamegraphs should handle missing profile data', async (t) => {
   equal(errors.length, 2, 'Should log errors for both services with missing profiles')
 })
-test('sendFlamegraphs should filter by serviceIds when provided', async (t) => {
+test('sendFlamegraphs should filter by workerIds when provided', async (t) => {
   setUpEnvironment()
   const app = createMockApp(port + 12)
   const getProfileCalls = []
-  app.watt.runtime.sendCommandToApplication = async (serviceId, command) => {
+  app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
     if (command === 'getLastProfile') {
-      getProfileCalls.push(serviceId)
+      getProfileCalls.push(workerId)
       return new Uint8Array([1, 2, 3])
     }
     return { success: false }
@@ -362,10 +364,49 @@ test('sendFlamegraphs should filter by serviceIds when provided', async (t) => {
   t.after(() => server.close())
   await flamegraphsPlugin(app)
-  await app.sendFlamegraphs({ serviceIds: ['service-1'] })
+  await app.sendFlamegraphs({ workerIds: ['service-1:0'] })
   equal(getProfileCalls.length, 1, 'Should only request profile for specified service')
-  equal(getProfileCalls[0], 'service-1', 'Should request profile for service-1')
+  equal(getProfileCalls[0], 'service-1:0', 'Should request profile for service-1')
+})
+test('sendFlamegraphs should try to get the profile from a service if worker is not available', async (t) => {
+  setUpEnvironment()
+  const app = createMockApp(port + 12)
+  const getProfileCalls = []
+  app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
+    if (command === 'getLastProfile') {
+      getProfileCalls.push(workerId)
+      if (workerId === 'service-1:2') {
+        throw new Error('Worker not available')
+      }
+      return new Uint8Array([1, 2, 3])
+    }
+    return { success: false }
+  }
+  // Mock HTTP server
+  const { createServer } = await import('node:http')
+  const server = createServer((req, res) => {
+    const body = []
+    req.on('data', chunk => body.push(chunk))
+    req.on('end', () => {
+      res.writeHead(200)
+      res.end()
+    })
+  })
+  await new Promise(resolve => server.listen(port + 12, resolve))
+  t.after(() => server.close())
+  await flamegraphsPlugin(app)
+  await app.sendFlamegraphs({ workerIds: ['service-1:2'] })
+  equal(getProfileCalls.length, 2)
+  equal(getProfileCalls[0], 'service-1:2')
+  equal(getProfileCalls[1], 'service-1')
 })
 test('sendFlamegraphs should skip when PLT_DISABLE_FLAMEGRAPHS is set', async (t) => {
@@ -376,9 +417,9 @@ test('sendFlamegraphs should skip when PLT_DISABLE_FLAMEGRAPHS is set', async (t
   const getProfileCalls = []
-  app.watt.runtime.sendCommandToApplication = async (serviceId, command) => {
+  app.watt.runtime.sendCommandToApplication = async (workerId, command) => {
     if (command === 'getLastProfile') {
-      getProfileCalls.push(serviceId)
+      getProfileCalls.push(workerId)
       return new Uint8Array([1, 2, 3])
     }
     return { success: false }
@@ -452,6 +493,13 @@ test('should handle trigger-flamegraph command and upload flamegraphs from servi
   await app.connectToUpdates()
   await app.setupFlamegraphs()
+  t.after(async () => {
+    if (app.cleanupFlamegraphs) {
+      app.cleanupFlamegraphs()
+    }
+    await app.closeUpdates()
+  })
   await waitForClientSubscription
   const triggerFlamegraphMessage = {
@@ -473,9 +521,6 @@ test('should handle trigger-flamegraph command and upload flamegraphs from servi
   equal(service1Req.serviceId, 'service-1')
   equal(service2Req.serviceId, 'service-2')
-  if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
-  await app.closeUpdates()
 })
 test('should handle trigger-flamegraph when no runtime is available', async (t) => {
@@ -609,6 +654,13 @@ test('should handle trigger-heapprofile command and upload heap profiles from se
   await app.connectToUpdates()
   await app.setupFlamegraphs()
+  t.after(async () => {
+    if (app.cleanupFlamegraphs) {
+      app.cleanupFlamegraphs()
+    }
+    await app.closeUpdates()
+  })
   await waitForClientSubscription
   const triggerHeapProfileMessage = {
@@ -630,9 +682,6 @@ test('should handle trigger-heapprofile command and upload heap profiles from se
   equal(service1Req.serviceId, 'service-1')
   equal(service2Req.serviceId, 'service-2')
-  if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
-  await app.closeUpdates()
 })
 test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (t) => {
@@ -640,11 +689,6 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
   const receivedMessages = []
   const infoLogs = []
-  let errorCount = 0
-  let uploadResolve
-  const allUploadsComplete = new Promise((resolve) => {
-    uploadResolve = resolve
-  })
   const wss = new WebSocketServer({ port: port + 4 })
   t.after(async () => wss.close())
@@ -655,19 +699,21 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
     true
   )
-  const app = createMockApp(port + 4)
+  const app = createMockApp(port + 4, true, {
+    PLT_FLAMEGRAPHS_INTERVAL_SEC: 10,
+    PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: 1000
+  })
   const originalInfo = app.log.info
   app.log.info = (...args) => {
     originalInfo(...args)
-    if (args[1] && args[1].includes('No profile available for the service')) {
-      infoLogs.push(args)
-      errorCount++
-      if (errorCount === 2) {
-        uploadResolve()
-      }
-    }
+    infoLogs.push(args)
   }
+  // Profile will be generated in 10s
+  const profileGenerationDate = Date.now() + 10000
+  const mockProfile = new Uint8Array([1, 2, 3, 4, 5])
   app.watt.runtime.sendCommandToApplication = async (
     serviceId,
     command
@@ -676,9 +722,13 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
       return { success: true }
     }
     if (command === 'getLastProfile') {
-      const error = new Error('No profile available - wait for profiling to complete or trigger manual capture')
-      error.code = 'PLT_PPROF_NO_PROFILE_AVAILABLE'
-      throw error
+      const now = Date.now()
+      if (now < profileGenerationDate) {
+        const error = new Error('No profile available - wait for profiling to complete or trigger manual capture')
+        error.code = 'PLT_PPROF_NO_PROFILE_AVAILABLE'
+        throw error
+      }
+      return mockProfile
     }
     return { success: false }
   }
@@ -689,6 +739,13 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
   await app.connectToUpdates()
   await app.setupFlamegraphs()
+  t.after(async () => {
+    if (app.cleanupFlamegraphs) {
+      app.cleanupFlamegraphs()
+    }
+    await app.closeUpdates()
+  })
   await waitForClientSubscription
   const triggerFlamegraphMessage = {
@@ -697,15 +754,47 @@ test('should handle PLT_PPROF_NO_PROFILE_AVAILABLE error with info log', async (
   getWs().send(JSON.stringify(triggerFlamegraphMessage))
-  await allUploadsComplete
+  await sleep(15000)
-  equal(infoLogs.length, 2)
-  equal(infoLogs[0][0].serviceId, 'service-1')
-  equal(infoLogs[0][0].podId, 'test-pod-123')
-  equal(infoLogs[0][1], 'No profile available for the service')
+  const service1AttemptLogs = []
+  const service2AttemptLogs = []
+  const service1SuccessLogs = []
+  const service2SuccessLogs = []
-  if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
-  await app.closeUpdates()
+  for (const infoLog of infoLogs) {
+    if (infoLog.length !== 2) continue
+    const [options, message] = infoLog
+    if (message.includes('No profile available for the service')) {
+      const { workerId, attempt, maxAttempts, attemptTimeout } = options
+      equal(maxAttempts, 11)
+      equal(attemptTimeout, 1000)
+      if (workerId === 'service-1') {
+        service1AttemptLogs.push(infoLog)
+        equal(attempt, service1AttemptLogs.length)
+      }
+      if (workerId === 'service-2') {
+        service2AttemptLogs.push(infoLog)
+        equal(attempt, service2AttemptLogs.length)
+      }
+      continue
+    }
+    if (message.includes('Sending flamegraph')) {
+      if (options.serviceId === 'service-1') {
+        service1SuccessLogs.push(infoLog)
+      } else if (options.serviceId === 'service-2') {
+        service2SuccessLogs.push(infoLog)
+      }
+    }
+  }
+  equal(service1AttemptLogs.length, 10)
+  equal(service2AttemptLogs.length, 10)
+  equal(service1SuccessLogs.length, 1)
+  equal(service2SuccessLogs.length, 1)
 })
 test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) => {
@@ -762,6 +851,13 @@ test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) =>
   await app.connectToUpdates()
   await app.setupFlamegraphs()
+  t.after(async () => {
+    if (app.cleanupFlamegraphs) {
+      app.cleanupFlamegraphs()
+    }
+    await app.closeUpdates()
+  })
   await waitForClientSubscription
   const triggerFlamegraphMessage = {
@@ -773,12 +869,8 @@ test('should handle PLT_PPROF_NOT_ENOUGH_ELU error with info log', async (t) =>
   await allUploadsComplete
   equal(infoLogs.length, 2)
-  equal(infoLogs[0][0].serviceId, 'service-1')
-  equal(infoLogs[0][0].podId, 'test-pod-123')
+  equal(infoLogs[0][0].workerId, 'service-1')
   equal(infoLogs[0][1], 'ELU low, CPU profiling not active')
-  if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
-  await app.closeUpdates()
 })
 test('should start profiling on new workers that start after initial setup', async (t) => {
@@ -815,6 +907,13 @@ test('should start profiling on new workers that start after initial setup', asy
   await app.connectToUpdates()
   await app.setupFlamegraphs()
+  t.after(async () => {
+    if (app.cleanupFlamegraphs) {
+      app.cleanupFlamegraphs()
+    }
+    await app.closeUpdates()
+  })
   await waitForClientSubscription
   equal(startProfilingCalls.length, 4)
@@ -844,9 +943,6 @@ test('should start profiling on new workers that start after initial setup', asy
   equal(startProfilingCalls[5].options.durationMillis, 1000)
   equal(startProfilingCalls[5].options.eluThreshold, 0)
   equal(startProfilingCalls[5].options.type, 'heap')
-  if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
-  await app.closeUpdates()
 })
 test('should not start profiling on new workers when flamegraphs are disabled', async (t) => {
@@ -884,6 +980,13 @@ test('should not start profiling on new workers when flamegraphs are disabled',
   await app.connectToUpdates()
   await app.setupFlamegraphs()
+  t.after(async () => {
+    if (app.cleanupFlamegraphs) {
+      app.cleanupFlamegraphs()
+    }
+    await app.closeUpdates()
+  })
   await waitForClientSubscription
   equal(startProfilingCalls.length, 0)
@@ -897,9 +1000,6 @@ test('should not start profiling on new workers when flamegraphs are disabled',
   await sleep(10)
   equal(startProfilingCalls.length, 0)
-  if (app.cleanupFlamegraphs) app.cleanupFlamegraphs()
-  await app.closeUpdates()
 })
 test('sendFlamegraphs should include alertId in query params when provided', async (t) => {