npm - @platformatic/watt-extra - Versions diffs - 1.6.3-alpha.2 → 1.6.3-alpha.3 - Mend

@platformatic/watt-extra 1.6.3-alpha.2 → 1.6.3-alpha.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (9)

package/.claude/settings.local.json +10 -0
package/package.json +1 -1
package/plugins/alerts.js +1 -1
package/plugins/flamegraphs.js +227 -207
package/plugins/health-signals.js +1 -1
package/plugins/update.js +2 -2
package/test/alerts.test.js +7 -17
package/test/health-signals.test.js +2 -5
package/test/trigger-flamegraphs.test.js +187 -439

package/.claude/settings.local.json ADDED Viewed

@@ -0,0 +1,10 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(node --test-only:*)",
+      "Bash(node --test:*)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@platformatic/watt-extra",
-  "version": "1.6.3-alpha.2",
+  "version": "1.6.3-alpha.3",
   "description": "The Platformatic runtime manager",
   "type": "module",
   "scripts": {

package/plugins/alerts.js CHANGED Viewed

@@ -112,7 +112,7 @@ async function alerts (app, _opts) {
         const alert = await body.json()
-        app.sendFlamegraphs({
+        app.requestFlamegraphs({
           workerIds: [workerId],
           alertId: alert.id
         }).catch(err => {

package/plugins/flamegraphs.js CHANGED Viewed

@@ -1,146 +1,200 @@
 'use strict'
-import { setTimeout as sleep } from 'node:timers/promises'
 import { request } from 'undici'
-async function flamegraphs (app, _opts) {
-  const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
-  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
-  const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
-  const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
-  const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
-  const flamegraphsCacheCleanupInterval = app.env.PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL
+export class Profiler {
+  #workerId
+  #type
+  #duration
+  #profileOptions
+  #runtime
+  #log
+  #requests
+  #isProfiling
+  #onProfile
+  #getProfileInterval
+  #stopProfileTimeout
+  constructor (options = {}) {
+    const { type, duration, workerId, sourceMaps, app, onProfile } = options
+    if (type !== 'cpu' && type !== 'heap') {
+      throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
+    }
+    if (typeof duration !== 'number') {
+      throw new Error('Invalid Profiler duration. Must be a number')
+    }
+    if (typeof workerId !== 'string') {
+      throw new Error('Invalid Worker ID. Must be a string')
+    }
+    if (!workerId.includes(':')) {
+      throw new Error('Worker ID must include the service ID and worker index')
+    }
+    if (typeof onProfile !== 'function') {
+      throw new Error('Invalid onProfile handler. Must be a function')
+    }
-  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
-  const eluThreshold = parseFloat(flamegraphsELUThreshold)
-  const gracePeriod = parseInt(flamegraphsGracePeriod)
-  const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
-  const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
-  const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
+    this.#type = type
+    this.#duration = duration
+    this.#workerId = workerId
+    this.#onProfile = onProfile
-  let workerStartedListener = null
+    this.#profileOptions = {
+      type,
+      durationMillis: duration,
+      sourceMaps: sourceMaps ?? false
+    }
-  const startProfilingOnWorker = async (runtime, workerFullId, logContext = {}) => {
-    await sleep(gracePeriod)
+    this.#requests = []
+    this.#isProfiling = false
-    // Get application details to read service-level sourceMaps setting
-    const appDetails = await runtime.getApplicationDetails(workerFullId)
-    const sourceMaps = appDetails.sourceMaps ?? false
+    this.#runtime = app.watt.runtime
+    this.#log = app.log.child({
+      workerId: this.#workerId,
+      profilerType: this.#type
+    })
+  }
-    try {
-      // Start CPU profiling
-      await runtime.sendCommandToApplication(
-        workerFullId,
-        'startProfiling',
-        { durationMillis, eluThreshold, type: 'cpu', sourceMaps }
-      )
+  async requestProfile (request = {}) {
+    request.timestamp ??= Date.now()
+    this.#requests.push(request)
+    this.#unscheduleStopProfiling()
-      // Start HEAP profiling
-      await runtime.sendCommandToApplication(
-        workerFullId,
-        'startProfiling',
-        { durationMillis, eluThreshold, type: 'heap', sourceMaps }
-      )
-    } catch (err) {
-      app.log.error({ err, ...logContext }, 'Failed to start profiling')
-      throw err
+    if (!this.#isProfiling) {
+      this.#startProfilingLoop()
     }
   }
-  app.setupFlamegraphs = async () => {
-    if (isFlamegraphsDisabled) {
-      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
+  async stop () {
+    if (this.#getProfileInterval) {
+      clearInterval(this.#getProfileInterval)
+      this.#getProfileInterval = null
+    }
+    if (this.#stopProfileTimeout) {
+      clearTimeout(this.#stopProfileTimeout)
+      this.#stopProfileTimeout = null
+    }
+    if (this.#isProfiling) {
+      await this.#stopProfiling()
+    }
+  }
+  async #startProfilingLoop () {
+    try {
+      await this.#startProfiling()
+    } catch (err) {
+      this.#log.error({ err }, 'Failed to start profiling')
+      const requests = this.#getProfileRequests(Date.now())
+      this.#onProfile(err, null, requests)
       return
     }
-    app.log.info('Start profiling services')
+    this.#getProfileInterval = setInterval(
+      () => this.#processProfile(),
+      this.#duration
+    ).unref()
+  }
-    const runtime = app.watt.runtime
-    const workers = await runtime.getWorkers()
+  async #processProfile () {
+    try {
+      const profile = await this.#getProfile()
+      const requests = this.#getProfileRequests(profile.timestamp)
+      this.#onProfile(null, profile, requests)
+    } catch (err) {
+      this.#log.error({ err }, 'Failed to generate a profile')
+      const requests = this.#getProfileRequests(Date.now())
+      this.#onProfile(err, null, requests)
+    }
-    const promises = []
-    for (const [workerFullId, workerInfo] of Object.entries(workers)) {
-      if (workerInfo.status === 'started') {
-        const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
-        promises.push(promise)
-      }
+    if (this.#requests.length === 0) {
+      this.#scheduleStopProfiling()
     }
+  }
-    const results = await Promise.allSettled(promises)
-    for (const result of results) {
-      if (result.status === 'rejected') {
-        app.log.error({ result }, 'Failed to start profiling')
-      }
+  #scheduleStopProfiling () {
+    // Stop profiling after the duration/2 if there are no more requests
+    this.#stopProfileTimeout = setTimeout(
+      () => this.stop(),
+      this.#duration / 2
+    ).unref()
+  }
+  #unscheduleStopProfiling () {
+    if (this.#stopProfileTimeout) {
+      clearTimeout(this.#stopProfileTimeout)
+      this.#stopProfileTimeout = null
     }
+  }
-    // Listen for new workers starting and start profiling on them
-    workerStartedListener = ({ application, worker }) => {
-      if (isFlamegraphsDisabled) {
-        return
-      }
+  async #startProfiling () {
+    this.#isProfiling = true
+    this.#log.info('Starting profiling')
-      const workerFullId = [application, worker].join(':')
-      app.log.info({ application, worker }, 'Starting profiling on new worker')
+    await this.#runtime.sendCommandToApplication(
+      this.#workerId, 'startProfiling', this.#profileOptions
+    )
+  }
-      startProfilingOnWorker(runtime, workerFullId, { application, worker }).catch(() => {
-        // Error already logged in startProfilingOnWorker
-      })
-    }
-    runtime.on('application:worker:started', workerStartedListener)
+  async #stopProfiling () {
+    this.#isProfiling = false
+    this.#log.info('Stopping profiling')
-    setInterval(cleanupFlamegraphsCache, cacheCleanupInterval).unref()
+    await this.#runtime.sendCommandToApplication(
+      this.#workerId, 'stopProfiling', this.#profileOptions
+    )
   }
-  app.cleanupFlamegraphs = async () => {
-    if (workerStartedListener && app.watt?.runtime) {
-      app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
-      workerStartedListener = null
-    }
+  async #getProfile () {
+    this.#log.info('Getting profile from worker')
-    // Explicitly stop all active profiling sessions to avoid memory corruption
-    if (!isFlamegraphsDisabled && app.watt?.runtime) {
-      try {
-        const workers = await app.watt.runtime.getWorkers()
-        const stopPromises = []
-        for (const workerFullId of Object.keys(workers)) {
-          // Stop both CPU and heap profiling on each worker
-          stopPromises.push(
-            app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
-              .catch(err => {
-                // Ignore errors if profiling wasn't running
-                if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
-                  app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
-                }
-              })
-          )
-          stopPromises.push(
-            app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
-              .catch(err => {
-                // Ignore errors if profiling wasn't running
-                if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
-                  app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
-                }
-              })
-          )
-        }
-        await Promise.all(stopPromises)
-        // Small delay to ensure native cleanup completes
-        await sleep(100)
-      } catch (err) {
-        app.log.warn({ err }, 'Failed to stop profiling during cleanup')
+    const [state, profile] = await Promise.all([
+      this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
+      this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
+    ])
+    return { data: profile, timestamp: state.latestProfileTimestamp }
+  }
+  #getProfileRequests (profileTimestamp) {
+    let processedIndex = 0
+    for (let i = 0; i < this.#requests.length; i++) {
+      if (this.#requests[i].timestamp <= profileTimestamp) {
+        processedIndex = i + 1
       }
     }
+    return this.#requests.splice(0, processedIndex)
   }
+}
+async function flamegraphs (app, _opts) {
+  const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
+  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
+  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
-  const profilesByWorkerId = {}
+  const profilers = {}
+  const profilersConfigs = {}
-  app.sendFlamegraphs = async (options = {}) => {
+  app.setupFlamegraphs = async () => {
     if (isFlamegraphsDisabled) {
-      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
+      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
       return
     }
-    let { workerIds, alertId, profileType = 'cpu' } = options
+    const runtime = app.watt.runtime
+    const { applications } = await runtime.getApplications()
+    for (const application of applications) {
+      const appDetails = await runtime.getApplicationDetails(application.id)
+      const sourceMaps = appDetails.sourceMaps ?? false
+      profilersConfigs[application.id] = { durationMillis, sourceMaps }
+    }
+  }
+  app.requestFlamegraphs = async (options = {}) => {
+    if (isFlamegraphsDisabled) {
+      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
+      return
+    }
     const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
     if (!scalerUrl) {
@@ -150,118 +204,89 @@ async function flamegraphs (app, _opts) {
     const runtime = app.watt.runtime
-    if (!workerIds) {
-      const { applications } = await runtime.getApplications()
-      workerIds = applications.map(app => app.id)
-    }
+    let { workerIds, alertId, profileType = 'cpu' } = options
-    cleanupFlamegraphsCache()
+    const servicesWorkers = {}
+    const workers = await runtime.getWorkers()
+    for (const workerId in workers) {
+      const workerInfo = workers[workerId]
+      const serviceId = workerInfo.application
-    const uploadPromises = workerIds.map(async (workerId) => {
-      const serviceId = workerId.split(':')[0]
-      const profileKey = `${workerId}:${profileType}`
+      servicesWorkers[serviceId] ??= []
+      servicesWorkers[serviceId].push(workerId)
+    }
-      let profile = profilesByWorkerId[profileKey]
-      if (profile !== undefined) {
-        if (alertId) {
-          app.log.info(
-            { workerId, alertId }, 'Flamegraph will be attached to the alert'
-          )
-          profile.waitingAlerts.push(alertId)
-        }
+    workerIds ??= Object.keys(servicesWorkers)
-        if (profile.flamegraphId === null) {
-          app.log.info({ workerId }, 'Waiting for flamegraph to be generated and sent')
-          return
-        }
+    for (let workerId of workerIds) {
+      const [serviceId, workerIndex] = workerId.split(':')
+      if (workerIndex === undefined) {
+        workerId = servicesWorkers[serviceId][0]
+      }
+      if (workerId === undefined) {
+        app.log.error({ serviceId }, 'No worker found for an application')
+        continue
       }
-      if (profile === undefined) {
-        profile = {
+      const profileKey = `${workerId}:${profileType}`
+      let profiler = profilers[profileKey]
+      if (!profiler) {
+        const config = profilersConfigs[serviceId]
+        profiler = new Profiler({
+          app,
+          workerId,
           type: profileType,
-          data: null,
-          timestamp: null,
-          flamegraphId: null,
-          waitingAlerts: []
-        }
-        profilesByWorkerId[profileKey] = profile
+          duration: config.durationMillis,
+          sourceMaps: config.sourceMaps,
+          onProfile: createProfileHandler(scalerUrl, workerId, profileType)
+        })
+        profilers[profileKey] = profiler
+      }
-        const result = await getServiceFlamegraph(workerId, profileType)
-        if (!result || !(result.data instanceof Uint8Array)) {
-          app.log.error({ workerId }, 'Failed to get profile from service')
-          delete profilesByWorkerId[profileKey]
-          return
-        }
+      profiler.requestProfile({ alertId })
+    }
+  }
+  function createProfileHandler (scalerUrl, workerId, profileType) {
+    const serviceId = workerId.split(':')[0]
-        profile.data = result.data
-        profile.timestamp = result.timestamp
+    return async (err, profile, requests) => {
+      if (err) {
+        app.log.error({ err }, 'Failed to generate a profile')
+        return
       }
-      if (profile.flamegraphId === null || !alertId) {
-        try {
-          const flamegraph = await sendServiceFlamegraph(
-            scalerUrl,
-            serviceId,
-            profile.data,
-            profileType,
-            alertId
-          )
-          profile.flamegraphId = flamegraph.id
-        } catch (err) {
-          app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
-          delete profilesByWorkerId[profileKey]
-          return
+      const alertIds = []
+      for (const request of requests) {
+        if (request.alertId) {
+          alertIds.push(request.alertId)
         }
       }
-      const waitingAlerts = profile.waitingAlerts
-      if (waitingAlerts.length > 0) {
-        profile.waitingAlerts = []
-        await _attachFlamegraphToAlerts(
+      try {
+        const alertId = alertIds.shift()
+        const flamegraph = await sendServiceFlamegraph(
           scalerUrl,
           serviceId,
-          profile.flamegraphId,
           profile.data,
-          profile.type,
-          waitingAlerts
-        )
-      }
-    })
-    await Promise.all(uploadPromises)
-  }
-  async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
-    const runtime = app.watt.runtime
-    app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
-    try {
-      const [state, profile] = await Promise.all([
-        runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
-        runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
-      ])
-      return { data: profile, timestamp: state.latestProfileTimestamp }
-    } catch (err) {
-      if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
-        app.log.info(
-          { workerId, attempt, maxAttempts, attemptTimeout },
-          'No profile available for the service. Waiting for profiling to complete.'
+          profileType,
+          alertId
         )
-        if (attempt <= maxAttempts) {
-          await sleep(attemptTimeout)
-          return getServiceFlamegraph(workerId, profileType, attempt + 1)
-        }
-      } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
-        app.log.info({ workerId }, 'ELU low, CPU profiling not active')
-      } else {
-        app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
-        const [serviceId, workerIndex] = workerId.split(':')
-        if (workerIndex) {
-          app.log.warn('Worker not available, trying to get profile from another worker')
-          return getServiceFlamegraph(serviceId, profileType)
+        if (alertIds.length > 0) {
+          await _attachFlamegraphToAlerts(
+            scalerUrl,
+            serviceId,
+            flamegraph.id,
+            profile.data,
+            profileType,
+            alertIds
+          )
         }
+      } catch (err) {
+        app.log.error({ err, workerId }, 'Failed to send flamegraph')
       }
     }
   }
@@ -367,15 +392,10 @@ async function flamegraphs (app, _opts) {
     }
   }
-  function cleanupFlamegraphsCache () {
-    const now = Date.now()
-    for (const profileKey of Object.keys(profilesByWorkerId)) {
-      const timestamp = profilesByWorkerId[profileKey]?.timestamp
-      if (timestamp && now - timestamp > durationMillis) {
-        delete profilesByWorkerId[profileKey]
-      }
-    }
+  app.cleanupFlamegraphs = async () => {
+    // Stop all tracked profilers in parallel
+    const stopPromises = Object.values(profilers).map(profiler => profiler.stop())
+    await Promise.all(stopPromises)
   }
 }

package/plugins/health-signals.js CHANGED Viewed

@@ -185,7 +185,7 @@ async function healthSignals (app, _opts) {
     const alert = await body.json()
-    app.sendFlamegraphs({
+    app.requestFlamegraphs({
       serviceIds: [serviceId],
       workerIds: [workerId],
       alertId: alert.id

package/plugins/update.js CHANGED Viewed

@@ -23,14 +23,14 @@ async function updatePlugin (app) {
       // Handle trigger-flamegraph command from ICC
       if (command === 'trigger-flamegraph') {
         app.log.info({ command }, 'Received trigger-flamegraph command from ICC')
-        app.sendFlamegraphs({ profileType: 'cpu' })
+        app.requestFlamegraphs({ profileType: 'cpu' })
         return
       }
       // Handle trigger-heapprofile command from ICC
       if (command === 'trigger-heapprofile') {
         app.log.info({ command }, 'Received trigger-heapprofile command from ICC')
-        app.sendFlamegraphs({ profileType: 'heap' })
+        app.requestFlamegraphs({ profileType: 'heap' })
         return
       }

package/test/alerts.test.js CHANGED Viewed

@@ -90,9 +90,6 @@ test('should send alert when service becomes unhealthy', async (t) => {
     await icc.close()
   })
-  // Wait for the first flamegraph to be generated
-  await sleep(5000)
   // Manually trigger health event with unhealthy state
   const healthInfo = {
     id: 'main:0',
@@ -133,6 +130,9 @@ test('should send alert when service becomes unhealthy', async (t) => {
   assert.strictEqual(alertReceived.healthHistory[0].application, 'main')
   assert.strictEqual(alertReceived.healthHistory[0].service, 'main')
+  // Wait for flamegraph to be generated (duration is 2 seconds)
+  await sleep(2500)
   assert.ok(flamegraphReceived, 'Flamegraph should have been received')
   const profile = Profile.decode(flamegraphReceived)
@@ -526,8 +526,6 @@ test('should send alert when flamegraphs are disabled', async (t) => {
     await icc.close()
   })
-  await sleep(5000)
   // Manually trigger health event with unhealthy state
   const healthInfo = {
     id: 'main:0',
@@ -611,8 +609,6 @@ test('should send alert when failed to send a flamegraph', async (t) => {
     await icc.close()
   })
-  await sleep(5000)
   // Manually trigger health event with unhealthy state
   const healthInfo = {
     id: 'main:0',
@@ -799,9 +795,6 @@ test('should attach one flamegraph to multiple alerts', async (t) => {
     await icc.close()
   })
-  // Wait for the first flamegraph to be generated
-  await sleep(5000)
   // Manually trigger health event with unhealthy state
   const healthInfo = {
     id: 'main:0',
@@ -827,8 +820,8 @@ test('should attach one flamegraph to multiple alerts', async (t) => {
   await sleep(1000)
   emitHealthEvent(app, healthInfo)
-  // Wait for flamegraphs to be sent
-  await sleep(1000)
+  // Wait for flamegraph to be generated (duration is 5 seconds) and sent
+  await sleep(5500)
   assert.strictEqual(receivedAlerts.length, 2)
   const alert1 = receivedAlerts[0]
@@ -902,9 +895,6 @@ test('should send flamegraphs if attaching fails', async (t) => {
     await icc.close()
   })
-  // Wait for the first flamegraph to be generated
-  await sleep(5000)
   // Manually trigger health event with unhealthy state
   const healthInfo = {
     id: 'main:0',
@@ -930,8 +920,8 @@ test('should send flamegraphs if attaching fails', async (t) => {
   await sleep(1000)
   emitHealthEvent(app, healthInfo)
-  // Wait for flamegraphs to be sent
-  await sleep(1000)
+  // Wait for flamegraph to be generated (duration is 5 seconds) and sent
+  await sleep(5500)
   assert.strictEqual(receivedAlerts.length, 2)
   const alert1 = receivedAlerts[0]

package/test/health-signals.test.js CHANGED Viewed

@@ -58,9 +58,6 @@ test('should send health signals when service becomes unhealthy', async (t) => {
     await icc.close()
   })
-  // Wait for the first flamegraph to be generated
-  await sleep(5000)
   {
     const { statusCode } = await request('http://127.0.0.1:3042/custom-health-signal', {
       method: 'POST',
@@ -119,8 +116,8 @@ test('should send health signals when service becomes unhealthy', async (t) => {
     assert.ok(receivedSignal.timestamp > 0)
   }
-  // Wait for the second flamegraph to be generated
-  await sleep(2000)
+  // Wait for flamegraph to be generated (duration is 2 seconds)
+  await sleep(2500)
   // assert.strictEqual(receivedFlamegraphReqs.length, 1)