npm - @platformatic/watt-extra - Versions diffs - 1.7.0 → 1.7.1-alpha.2 - Mend

@platformatic/watt-extra 1.7.0 → 1.7.1-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/.claude/settings.local.json +5 -8
package/package.json +1 -1
package/plugins/alerts.js +11 -6
package/plugins/env.js +2 -3
package/plugins/flamegraphs.js +330 -205
package/plugins/health-signals.js +9 -8
package/plugins/update.js +2 -2
package/test/alerts.test.js +106 -26
package/test/fixtures/service-1/routes/root.cjs +13 -1
package/test/health-signals.test.js +166 -10
package/test/profiler.test.js +443 -0
package/test/trigger-flamegraphs.test.js +257 -416

package/plugins/flamegraphs.js CHANGED

@@ -1,146 +1,234 @@
 'use strict'
-import { setTimeout as sleep } from 'node:timers/promises'
 import { request } from 'undici'
-async function flamegraphs (app, _opts) {
-  const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
-  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
-  const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
-  const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
-  const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
-  const flamegraphsCacheCleanupInterval = app.env.PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL
+export class Profiler {
+  #workerId
+  #type
+  #duration
+  #profileOptions
+  #runtime
+  #log
+  #requests
+  #isProfiling
+  #onProfile
+  #getProfileInterval
+  #stopProfileTimeout
+  constructor (options = {}) {
+    const { type, duration, workerId, sourceMaps, app, onProfile } = options
+    if (type !== 'cpu' && type !== 'heap') {
+      throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
+    }
+    if (typeof duration !== 'number') {
+      throw new Error('Invalid Profiler duration. Must be a number')
+    }
+    if (typeof workerId !== 'string') {
+      throw new Error('Invalid Worker ID. Must be a string')
+    }
+    if (!workerId.includes(':')) {
+      throw new Error('Worker ID must include the service ID and worker index')
+    }
+    if (typeof onProfile !== 'function') {
+      throw new Error('Invalid onProfile handler. Must be a function')
+    }
-  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
-  const eluThreshold = parseFloat(flamegraphsELUThreshold)
-  const gracePeriod = parseInt(flamegraphsGracePeriod)
-  const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
-  const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
-  const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
+    this.#type = type
+    this.#duration = duration
+    this.#workerId = workerId
+    this.#onProfile = onProfile
-  let workerStartedListener = null
+    this.#profileOptions = {
+      type,
+      durationMillis: duration,
+      sourceMaps: sourceMaps ?? false
+    }
-  const startProfilingOnWorker = async (runtime, workerFullId, logContext = {}) => {
-    await sleep(gracePeriod)
+    this.#requests = []
+    this.#isProfiling = false
-    // Get application details to read service-level sourceMaps setting
-    const appDetails = await runtime.getApplicationDetails(workerFullId)
-    const sourceMaps = appDetails.sourceMaps ?? false
+    this.#runtime = app.watt.runtime
+    this.#log = app.log.child({
+      workerId: this.#workerId,
+      profilerType: this.#type
+    })
+  }
-    try {
-      // Start CPU profiling
-      await runtime.sendCommandToApplication(
-        workerFullId,
-        'startProfiling',
-        { durationMillis, eluThreshold, type: 'cpu', sourceMaps }
-      )
+  get workerId () {
+    return this.#workerId
+  }
-      // Start HEAP profiling
-      await runtime.sendCommandToApplication(
-        workerFullId,
-        'startProfiling',
-        { durationMillis, eluThreshold, type: 'heap', sourceMaps }
-      )
-    } catch (err) {
-      app.log.error({ err, ...logContext }, 'Failed to start profiling')
-      throw err
+  get isProfiling () {
+    return this.#isProfiling
+  }
+  async requestProfile (request = {}) {
+    request.timestamp ??= Date.now()
+    this.#requests.push(request)
+    this.#unscheduleStopProfiling()
+    if (!this.#isProfiling) {
+      this.#startProfilingLoop()
     }
   }
-  app.setupFlamegraphs = async () => {
-    if (isFlamegraphsDisabled) {
-      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
+  async stop () {
+    process._rawDebug('--------PROFILER.STOP-------')
+    if (this.#getProfileInterval) {
+      clearInterval(this.#getProfileInterval)
+      this.#getProfileInterval = null
+    }
+    if (this.#stopProfileTimeout) {
+      clearTimeout(this.#stopProfileTimeout)
+      this.#stopProfileTimeout = null
+    }
+    if (this.#isProfiling) {
+      const requests = this.#getProfileRequests()
+      try {
+        const profile = await this.#stopProfiling()
+        if (requests.length > 0) {
+          this.#onProfile(null, profile, requests)
+        }
+      } catch (err) {
+        this.#log.error({ err }, 'Failed to stop profiling')
+        if (requests.length > 0) {
+          this.#onProfile(err, null, requests)
+        }
+      }
+    }
+  }
+  async #startProfilingLoop () {
+    try {
+      await this.#startProfiling()
+    } catch (err) {
+      this.#log.error({ err }, 'Failed to start profiling')
+      const requests = this.#getProfileRequests()
+      this.#onProfile(err, null, requests)
       return
     }
-    app.log.info('Start profiling services')
+    this.#getProfileInterval = setInterval(
+      () => this.#processProfile(),
+      this.#duration
+    ).unref()
+  }
-    const runtime = app.watt.runtime
-    const workers = await runtime.getWorkers()
+  async #processProfile () {
+    try {
+      const profile = await this.#getProfile()
+      const requests = this.#getProfileRequests(profile.timestamp)
+      this.#onProfile(null, profile, requests)
+    } catch (err) {
+      this.#log.error({ err }, 'Failed to generate a profile')
+      const requests = this.#getProfileRequests()
+      this.#onProfile(err, null, requests)
+    }
-    const promises = []
-    for (const [workerFullId, workerInfo] of Object.entries(workers)) {
-      if (workerInfo.status === 'started') {
-        const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
-        promises.push(promise)
-      }
+    if (this.#requests.length === 0) {
+      this.#scheduleStopProfiling()
     }
+  }
-    const results = await Promise.allSettled(promises)
-    for (const result of results) {
-      if (result.status === 'rejected') {
-        app.log.error({ result }, 'Failed to start profiling')
-      }
+  #scheduleStopProfiling () {
+    // Stop profiling after the duration/2 if there are no more requests
+    this.#stopProfileTimeout = setTimeout(
+      () => this.stop(),
+      this.#duration / 2
+    ).unref()
+  }
+  #unscheduleStopProfiling () {
+    if (this.#stopProfileTimeout) {
+      clearTimeout(this.#stopProfileTimeout)
+      this.#stopProfileTimeout = null
     }
+  }
-    // Listen for new workers starting and start profiling on them
-    workerStartedListener = ({ application, worker }) => {
-      if (isFlamegraphsDisabled) {
-        return
-      }
+  async #startProfiling () {
+    this.#isProfiling = true
+    this.#log.info('Starting profiling')
-      const workerFullId = [application, worker].join(':')
-      app.log.info({ application, worker }, 'Starting profiling on new worker')
+    await this.#runtime.sendCommandToApplication(
+      this.#workerId, 'startProfiling', this.#profileOptions
+    )
+  }
+  async #stopProfiling () {
+    this.#isProfiling = false
+    this.#log.info('Stopping profiling')
-      startProfilingOnWorker(runtime, workerFullId, { application, worker }).catch(() => {
-        // Error already logged in startProfilingOnWorker
-      })
+    try {
+      const profile = await this.#runtime.sendCommandToApplication(
+        this.#workerId, 'stopProfiling', this.#profileOptions
+      )
+      return profile
+    } catch (err) {
+      // Ignore errors if the app is already closing
+      this.#log.debug({ err }, 'Failed to stop profiling')
     }
-    runtime.on('application:worker:started', workerStartedListener)
+  }
+  async #getProfile () {
+    this.#log.info('Getting profile from worker')
-    setInterval(cleanupFlamegraphsCache, cacheCleanupInterval).unref()
+    const [state, profile] = await Promise.all([
+      this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
+      this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
+    ])
+    return { data: profile, timestamp: state.latestProfileTimestamp }
   }
-  app.cleanupFlamegraphs = async () => {
-    if (workerStartedListener && app.watt?.runtime) {
-      app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
-      workerStartedListener = null
+  #getProfileRequests (profileTimestamp) {
+    if (profileTimestamp === undefined) {
+      const requests = this.#requests
+      this.#requests = []
+      return requests
     }
-    // Explicitly stop all active profiling sessions to avoid memory corruption
-    if (!isFlamegraphsDisabled && app.watt?.runtime) {
-      try {
-        const workers = await app.watt.runtime.getWorkers()
-        const stopPromises = []
-        for (const workerFullId of Object.keys(workers)) {
-          // Stop both CPU and heap profiling on each worker
-          stopPromises.push(
-            app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
-              .catch(err => {
-                // Ignore errors if profiling wasn't running
-                if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
-                  app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
-                }
-              })
-          )
-          stopPromises.push(
-            app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
-              .catch(err => {
-                // Ignore errors if profiling wasn't running
-                if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
-                  app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
-                }
-              })
-          )
-        }
-        await Promise.all(stopPromises)
-        // Small delay to ensure native cleanup completes
-        await sleep(100)
-      } catch (err) {
-        app.log.warn({ err }, 'Failed to stop profiling during cleanup')
+    let processedIndex = 0
+    for (let i = 0; i < this.#requests.length; i++) {
+      if (this.#requests[i].timestamp <= profileTimestamp) {
+        processedIndex = i + 1
       }
     }
+    return this.#requests.splice(0, processedIndex)
   }
+}
-  const profilesByWorkerId = {}
+async function flamegraphs (app, _opts) {
+  const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
+  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
+  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
-  app.sendFlamegraphs = async (options = {}) => {
+  const profilers = {}
+  const profilersConfigs = {}
+  const profilersPauseReqs = {}
+  app.setupFlamegraphs = async () => {
     if (isFlamegraphsDisabled) {
-      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
+      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
       return
     }
-    let { workerIds, alertId, profileType = 'cpu' } = options
+    const runtime = app.watt.runtime
+    const { applications } = await runtime.getApplications()
+    for (const application of applications) {
+      const appDetails = await runtime.getApplicationDetails(application.id)
+      const sourceMaps = appDetails.sourceMaps ?? false
+      profilersConfigs[application.id] = { durationMillis, sourceMaps }
+    }
+  }
+  app.requestFlamegraphs = async (options = {}) => {
+    process._rawDebug('--------REQUEST PROFILING-------', options)
+    if (isFlamegraphsDisabled) {
+      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
+      return
+    }
     const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
     if (!scalerUrl) {
@@ -150,118 +238,147 @@ async function flamegraphs (app, _opts) {
     const runtime = app.watt.runtime
-    if (!workerIds) {
-      const { applications } = await runtime.getApplications()
-      workerIds = applications.map(app => app.id)
-    }
+    let { serviceIds, alertId, profileType = 'cpu' } = options
-    cleanupFlamegraphsCache()
+    const servicesWorkers = {}
+    const workers = await runtime.getWorkers()
-    const uploadPromises = workerIds.map(async (workerId) => {
-      const serviceId = workerId.split(':')[0]
-      const profileKey = `${workerId}:${profileType}`
+    for (const workerId in workers) {
+      const workerInfo = workers[workerId]
+      const serviceId = workerInfo.application
-      let profile = profilesByWorkerId[profileKey]
-      if (profile !== undefined) {
-        if (alertId) {
-          app.log.info(
-            { workerId, alertId }, 'Flamegraph will be attached to the alert'
-          )
-          profile.waitingAlerts.push(alertId)
-        }
+      servicesWorkers[serviceId] ??= []
+      servicesWorkers[serviceId].push(workerId)
+    }
-        if (profile.flamegraphId === null) {
-          app.log.info({ workerId }, 'Waiting for flamegraph to be generated and sent')
-          return
+    for (const serviceId in profilers) {
+      const workerProfilers = profilers[serviceId]
+      for (const profileType in workerProfilers) {
+        const profiler = workerProfilers[profileType]
+        const workerId = profiler.workerId
+        if (workers[workerId]) continue
+        if (profiler.isProfiling) {
+          profiler.stop()
         }
+        delete profilers[serviceId][profileType]
+      }
+    }
+    serviceIds ??= Object.keys(servicesWorkers)
+    for (const serviceId of serviceIds) {
+      const { isPaused, remainingTimeSec } = isProfilingPaused(serviceId)
+      if (isPaused) {
+        app.log.info(
+          { serviceId },
+          `Skipping service profiling, it is paused for ${remainingTimeSec}s`
+        )
+        process._rawDebug('--------SKIP PROFILING-------', serviceId)
+        continue
       }
-      if (profile === undefined) {
-        profile = {
+      profilers[serviceId] ??= {}
+      let profiler = profilers[serviceId][profileType]
+      if (!profiler) {
+        const workerId = servicesWorkers[serviceId][0]
+        const config = profilersConfigs[serviceId]
+        profiler = new Profiler({
+          app,
+          workerId,
           type: profileType,
-          data: null,
-          timestamp: null,
-          flamegraphId: null,
-          waitingAlerts: []
-        }
-        profilesByWorkerId[profileKey] = profile
+          duration: config.durationMillis,
+          sourceMaps: config.sourceMaps,
+          onProfile: createProfileHandler(scalerUrl, workerId, profileType)
+        })
+        profilers[serviceId][profileType] = profiler
+      }
-        const result = await getServiceFlamegraph(workerId, profileType)
-        if (!result || !(result.data instanceof Uint8Array)) {
-          app.log.error({ workerId }, 'Failed to get profile from service')
-          delete profilesByWorkerId[profileKey]
-          return
-        }
+      process._rawDebug('--------REQUEST PROFILING-------', serviceId)
+      profiler.requestProfile({ alertId })
+    }
+  }
+  // Method to be called when the worker ELU is very high
+  // to stop profiling and wait for app to go back to normal
+  app.pauseProfiling = async (options = {}) => {
+    process._rawDebug('--------PAUSE PROFILING-------', options)
+    if (isFlamegraphsDisabled) {
+      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
+      return
+    }
+    const { serviceId, timeout } = options
+    profilersPauseReqs[serviceId] = { timestamp: timeout + Date.now() }
+    const serviceProfilers = profilers[serviceId]
+    if (!serviceProfilers) {
+      app.log.debug({ serviceId }, 'Skipping service profiling pause, no profilers found')
+      return
+    }
+    for (const profilerType in profilers[serviceId]) {
+      const profiler = profilers[serviceId][profilerType]
+      app.log.info({ serviceId, profilerType }, 'Pausing service profiling due to high ELU')
+      await profiler.stop()
+    }
+  }
+  function isProfilingPaused (serviceId) {
+    let isPaused = false
+    let remainingTimeSec = 0
+    const pauseReq = profilersPauseReqs[serviceId]
+    if (pauseReq) {
+      const now = Date.now()
+      isPaused = pauseReq.timestamp > now
+      remainingTimeSec = Math.round((pauseReq.timestamp - now) / 1000)
+      process._rawDebug('--------IS PROFILING PAUSED-------', isPaused, remainingTimeSec)
+    }
+    return { isPaused, remainingTimeSec }
+  }
+  function createProfileHandler (scalerUrl, workerId, profileType) {
+    const serviceId = workerId.split(':')[0]
-        profile.data = result.data
-        profile.timestamp = result.timestamp
+    return async (err, profile, requests) => {
+      process._rawDebug('--------PROFILER HANDLER-------', !!profile, requests)
+      if (err) {
+        app.log.error({ err }, 'Failed to generate a profile')
+        return
       }
-      if (profile.flamegraphId === null || !alertId) {
-        try {
-          const flamegraph = await sendServiceFlamegraph(
-            scalerUrl,
-            serviceId,
-            profile.data,
-            profileType,
-            alertId
-          )
-          profile.flamegraphId = flamegraph.id
-        } catch (err) {
-          app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
-          delete profilesByWorkerId[profileKey]
-          return
+      const alertIds = []
+      for (const request of requests) {
+        if (request.alertId) {
+          alertIds.push(request.alertId)
         }
       }
-      const waitingAlerts = profile.waitingAlerts
-      if (waitingAlerts.length > 0) {
-        profile.waitingAlerts = []
-        await _attachFlamegraphToAlerts(
+      try {
+        const alertId = alertIds.shift()
+        const flamegraph = await sendServiceFlamegraph(
           scalerUrl,
           serviceId,
-          profile.flamegraphId,
           profile.data,
-          profile.type,
-          waitingAlerts
-        )
-      }
-    })
-    await Promise.all(uploadPromises)
-  }
-  async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
-    const runtime = app.watt.runtime
-    app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
-    try {
-      const [state, profile] = await Promise.all([
-        runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
-        runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
-      ])
-      return { data: profile, timestamp: state.latestProfileTimestamp }
-    } catch (err) {
-      if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
-        app.log.info(
-          { workerId, attempt, maxAttempts, attemptTimeout },
-          'No profile available for the service. Waiting for profiling to complete.'
+          profileType,
+          alertId
         )
-        if (attempt <= maxAttempts) {
-          await sleep(attemptTimeout)
-          return getServiceFlamegraph(workerId, profileType, attempt + 1)
-        }
-      } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
-        app.log.info({ workerId }, 'ELU low, CPU profiling not active')
-      } else {
-        app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
-        const [serviceId, workerIndex] = workerId.split(':')
-        if (workerIndex) {
-          app.log.warn('Worker not available, trying to get profile from another worker')
-          return getServiceFlamegraph(serviceId, profileType)
+        if (alertIds.length > 0) {
+          await _attachFlamegraphToAlerts(
+            scalerUrl,
+            serviceId,
+            flamegraph.id,
+            profile.data,
+            profileType,
+            alertIds
+          )
         }
+      } catch (err) {
+        app.log.error({ err, workerId }, 'Failed to send flamegraph')
       }
     }
   }
@@ -269,7 +386,7 @@ async function flamegraphs (app, _opts) {
   async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
     const podId = app.instanceId
     const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
-    app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
+    app.log.info({ serviceId, podId, profileType, alertId }, 'Sending flamegraph')
     const query = { profileType }
     if (alertId) {
@@ -293,8 +410,14 @@ async function flamegraphs (app, _opts) {
       throw new Error(`Failed to send flamegraph: ${error}`)
     }
-    const response = await body.json()
-    return response
+    const flamegraph = await body.json()
+    app.log.info(
+      { serviceId, podId, profileType, flamegraph },
+      'Flamegraph successfully stored'
+    )
+    return flamegraph
   }
   // Function that supports ICC that doesn't have attach flamegraph API
@@ -367,15 +490,17 @@ async function flamegraphs (app, _opts) {
     }
   }
-  function cleanupFlamegraphsCache () {
-    const now = Date.now()
-    for (const profileKey of Object.keys(profilesByWorkerId)) {
-      const timestamp = profilesByWorkerId[profileKey]?.timestamp
-      if (timestamp && now - timestamp > durationMillis) {
-        delete profilesByWorkerId[profileKey]
+  app.cleanupFlamegraphs = async () => {
+    // Stop all tracked profilers in parallel
+    const stopPromises = []
+    for (const serviceId in profilers) {
+      const serviceProfilers = profilers[serviceId]
+      for (const profileType in serviceProfilers) {
+        const profiler = serviceProfilers[profileType]
+        stopPromises.push(profiler.stop())
       }
     }
+    await Promise.all(stopPromises)
   }
 }

package/plugins/health-signals.js CHANGED

@@ -51,6 +51,8 @@ async function healthSignals (app, _opts) {
       return
     }
+    const pauseEluThreshold = app.env.PLT_FLAMEGRAPHS_PAUSE_ELU_TRESHOLD
+    const pauseTimeout = app.env.PLT_FLAMEGRAPHS_PAUSE_TIMEOUT
     const eluThreshold = app.env.PLT_ELU_HEALTH_SIGNAL_THRESHOLD
     let heapThreshold = app.env.PLT_HEAP_HEALTH_SIGNAL_THRESHOLD
@@ -88,6 +90,10 @@ async function healthSignals (app, _opts) {
       const { elu, heapUsed, heapTotal } = currentHealth
+      if (elu >= pauseEluThreshold) {
+        app.pauseProfiling({ serviceId, timeout: pauseTimeout })
+      }
       if (elu > eluThreshold) {
         healthSignals.push({
           type: 'elu',
@@ -183,15 +189,10 @@ async function healthSignals (app, _opts) {
       app.log.error({ error }, 'Failed to send health signals to scaler')
     }
-    const alert = await body.json()
+    const response = await body.json()
-    app.sendFlamegraphs({
-      serviceIds: [serviceId],
-      workerIds: [workerId],
-      alertId: alert.id
-    }).catch(err => {
-      app.log.error({ err }, 'Failed to send a flamegraph')
-    })
+    app.requestFlamegraphs({ serviceIds: [serviceId], alertId: response.alertId })
+      .catch(err => app.log.error({ err }, 'Failed to send a flamegraph'))
   }
 }