npm - @platformatic/watt-extra - Versions diffs - 1.6.3-alpha.5 → 1.7.0 - Mend

@platformatic/watt-extra 1.6.3-alpha.5 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/.claude/settings.local.json +8 -5
package/package.json +1 -1
package/plugins/alerts.js +25 -1
package/plugins/env.js +2 -1
package/plugins/flamegraphs.js +210 -244
package/plugins/health-signals.js +3 -5
package/plugins/update.js +2 -2
package/test/alerts.test.js +179 -7
package/test/health-signals.test.js +5 -2
package/test/helper.js +1 -0
package/test/trigger-flamegraphs.test.js +439 -187
package/test/profiler.test.js +0 -443

package/.claude/settings.local.json CHANGED Viewed

@@ -1,11 +1,14 @@
 {
   "permissions": {
     "allow": [
-      "Bash(node --test-only:*)",
-      "Bash(node --test:*)",
-      "Bash(for i in {1..3})",
-      "Bash(do echo \"=== Run $i ===\")",
-      "Bash(done)"
+      "Read(//work/workspaces/workspace-platformatic/platformatic/**)",
+      "Bash(npx borp:*)",
+      "Bash(timeout 30 npx borp -c 1 --timeout=20000 ./test/trigger-flamegraphs.test.js)",
+      "Bash(xargs cat:*)",
+      "Bash(pnpm install)",
+      "Bash(find:*)",
+      "Bash(cat:*)",
+      "WebFetch(domain:github.com)"
     ],
     "deny": [],
     "ask": []

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@platformatic/watt-extra",
-  "version": "1.6.3-alpha.5",
+  "version": "1.7.0",
   "description": "The Platformatic runtime manager",
   "type": "module",
   "scripts": {

package/plugins/alerts.js CHANGED Viewed

@@ -8,6 +8,7 @@ async function alerts (app, _opts) {
     app.instanceConfig?.scaler?.alertRetentionWindow || 10 * 1000
   const lastServicesAlertTime = {}
+  const workerStartTimes = new Map() // Track per-worker start times for grace period
   async function setupAlerts () {
     const scalerAlgorithmVersion = app.instanceConfig?.scaler?.version ?? 'v1'
@@ -17,6 +18,9 @@ async function alerts (app, _opts) {
     }
     app.log.info('Setting up v1 scaler alerts')
+    // Grace period during which alerts are suppressed per-worker.
+    const gracePeriodMs = app.env.PLT_ALERTS_GRACE_PERIOD_SEC * 1000
     // Skip alerts setup if ICC is not configured
     if (!app.env.PLT_ICC_URL) {
       app.log.info('PLT_ICC_URL not set, skipping alerts setup')
@@ -33,6 +37,18 @@ async function alerts (app, _opts) {
       return
     }
+    // Default start time for workers that started before the listener was registered
+    const pluginStartTime = Date.now()
+    // Listen for worker start events to track start times
+    runtime.on('application:worker:started', (workerInfo) => {
+      const workerId = workerInfo?.id
+      if (workerId) {
+        workerStartTimes.set(workerId, Date.now())
+        app.log.debug({ workerId }, 'Worker started, tracking for grace period')
+      }
+    })
     const processHealthInfo = async (healthInfo) => {
       if (!healthInfo) {
         app.log.error('No health info received')
@@ -55,6 +71,14 @@ async function alerts (app, _opts) {
         healthCache.splice(0, validIndex)
       }
+      // Skip sending alerts during worker's grace period.
+      // Use plugin start time as default for workers that started before the listener.
+      const workerStartTime = workerStartTimes.get(workerId) ?? pluginStartTime
+      if (timestamp - workerStartTime < gracePeriodMs) {
+        app.log.debug({ workerId }, 'Skipping alert during worker grace period')
+        return
+      }
       // healthInfo is an object with the following structure:
       // id: "service-1"
       // service: "service-1"
@@ -112,7 +136,7 @@ async function alerts (app, _opts) {
         const alert = await body.json()
-        app.requestFlamegraphs({
+        app.sendFlamegraphs({
           workerIds: [workerId],
           alertId: alert.id
         }).catch(err => {

package/plugins/env.js CHANGED Viewed

@@ -26,7 +26,8 @@ const schema = {
     PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
     PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
     PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },
-    PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' }
+    PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' },
+    PLT_ALERTS_GRACE_PERIOD_SEC: { type: 'number', default: 30 }
   }
 }

package/plugins/flamegraphs.js CHANGED Viewed

@@ -1,207 +1,147 @@
 'use strict'
+import { setTimeout as sleep } from 'node:timers/promises'
 import { request } from 'undici'
-export class Profiler {
-  #workerId
-  #type
-  #duration
-  #profileOptions
-  #runtime
-  #log
-  #requests
-  #isProfiling
-  #onProfile
-  #getProfileInterval
-  #stopProfileTimeout
-  constructor (options = {}) {
-    const { type, duration, workerId, sourceMaps, app, onProfile } = options
-    if (type !== 'cpu' && type !== 'heap') {
-      throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
-    }
-    if (typeof duration !== 'number') {
-      throw new Error('Invalid Profiler duration. Must be a number')
-    }
-    if (typeof workerId !== 'string') {
-      throw new Error('Invalid Worker ID. Must be a string')
-    }
-    if (!workerId.includes(':')) {
-      throw new Error('Worker ID must include the service ID and worker index')
-    }
-    if (typeof onProfile !== 'function') {
-      throw new Error('Invalid onProfile handler. Must be a function')
-    }
-    this.#type = type
-    this.#duration = duration
-    this.#workerId = workerId
-    this.#onProfile = onProfile
+async function flamegraphs (app, _opts) {
+  const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
+  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
+  const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
+  const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
+  const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
+  const flamegraphsCacheCleanupInterval = app.env.PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL
-    this.#profileOptions = {
-      type,
-      durationMillis: duration,
-      sourceMaps: sourceMaps ?? false
-    }
+  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
+  const eluThreshold = parseFloat(flamegraphsELUThreshold)
+  const gracePeriod = parseInt(flamegraphsGracePeriod)
+  const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
+  const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
+  const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
-    this.#requests = []
-    this.#isProfiling = false
+  let workerStartedListener = null
-    this.#runtime = app.watt.runtime
-    this.#log = app.log.child({
-      workerId: this.#workerId,
-      profilerType: this.#type
-    })
-  }
+  const startProfilingOnWorker = async (runtime, workerFullId, logContext = {}) => {
+    await sleep(gracePeriod)
-  async requestProfile (request = {}) {
-    process._rawDebug('--------REQUEST--------', request)
-    request.timestamp ??= Date.now()
-    this.#requests.push(request)
-    this.#unscheduleStopProfiling()
+    // Get application details to read service-level sourceMaps setting
+    const appDetails = await runtime.getApplicationDetails(workerFullId)
+    const sourceMaps = appDetails.sourceMaps ?? false
-    if (!this.#isProfiling) {
-      this.#startProfilingLoop()
-    }
-  }
+    try {
+      // Start CPU profiling
+      await runtime.sendCommandToApplication(
+        workerFullId,
+        'startProfiling',
+        { durationMillis, eluThreshold, type: 'cpu', sourceMaps }
+      )
-  async stop () {
-    if (this.#getProfileInterval) {
-      clearInterval(this.#getProfileInterval)
-      this.#getProfileInterval = null
-    }
-    if (this.#stopProfileTimeout) {
-      clearTimeout(this.#stopProfileTimeout)
-      this.#stopProfileTimeout = null
-    }
-    if (this.#isProfiling) {
-      await this.#stopProfiling()
+      // Start HEAP profiling
+      await runtime.sendCommandToApplication(
+        workerFullId,
+        'startProfiling',
+        { durationMillis, eluThreshold, type: 'heap', sourceMaps }
+      )
+    } catch (err) {
+      app.log.error({ err, ...logContext }, 'Failed to start profiling')
+      throw err
     }
   }
-  async #startProfilingLoop () {
-    try {
-      await this.#startProfiling()
-    } catch (err) {
-      this.#log.error({ err }, 'Failed to start profiling')
-      const requests = this.#getProfileRequests(Date.now())
-      this.#onProfile(err, null, requests)
+  app.setupFlamegraphs = async () => {
+    if (isFlamegraphsDisabled) {
+      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
       return
     }
-    this.#getProfileInterval = setInterval(
-      () => this.#processProfile(),
-      this.#duration
-    ).unref()
-  }
+    app.log.info('Start profiling services')
-  async #processProfile () {
-    try {
-      const profile = await this.#getProfile()
-      const requests = this.#getProfileRequests(profile.timestamp)
-      this.#onProfile(null, profile, requests)
-    } catch (err) {
-      this.#log.error({ err }, 'Failed to generate a profile')
-      const requests = this.#getProfileRequests(Date.now())
-      this.#onProfile(err, null, requests)
-    }
+    const runtime = app.watt.runtime
+    const workers = await runtime.getWorkers()
-    if (this.#requests.length === 0) {
-      this.#scheduleStopProfiling()
+    const promises = []
+    for (const [workerFullId, workerInfo] of Object.entries(workers)) {
+      if (workerInfo.status === 'started') {
+        const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
+        promises.push(promise)
+      }
     }
-  }
-  #scheduleStopProfiling () {
-    // Stop profiling after the duration/2 if there are no more requests
-    this.#stopProfileTimeout = setTimeout(
-      () => this.stop(),
-      this.#duration / 2
-    ).unref()
-  }
-  #unscheduleStopProfiling () {
-    if (this.#stopProfileTimeout) {
-      clearTimeout(this.#stopProfileTimeout)
-      this.#stopProfileTimeout = null
+    const results = await Promise.allSettled(promises)
+    for (const result of results) {
+      if (result.status === 'rejected') {
+        app.log.error({ result }, 'Failed to start profiling')
+      }
     }
-  }
-  async #startProfiling () {
-    this.#isProfiling = true
-    this.#log.info('Starting profiling')
-    await this.#runtime.sendCommandToApplication(
-      this.#workerId, 'startProfiling', this.#profileOptions
-    )
-  }
+    // Listen for new workers starting and start profiling on them
+    workerStartedListener = ({ application, worker }) => {
+      if (isFlamegraphsDisabled) {
+        return
+      }
-  async #stopProfiling () {
-    this.#isProfiling = false
-    this.#log.info('Stopping profiling')
+      const workerFullId = [application, worker].join(':')
+      app.log.info({ application, worker }, 'Starting profiling on new worker')
-    try {
-      await this.#runtime.sendCommandToApplication(
-        this.#workerId, 'stopProfiling', this.#profileOptions
-      )
-    } catch (err) {
-      // Ignore errors if the app is already closing
-      this.#log.debug({ err }, 'Failed to stop profiling')
+      startProfilingOnWorker(runtime, workerFullId, { application, worker }).catch(() => {
+        // Error already logged in startProfilingOnWorker
+      })
     }
-  }
-  async #getProfile () {
-    this.#log.info('Getting profile from worker')
-    const [state, profile] = await Promise.all([
-      this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
-      this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
-    ])
-    return { data: profile, timestamp: state.latestProfileTimestamp }
-  }
+    runtime.on('application:worker:started', workerStartedListener)
-  #getProfileRequests (profileTimestamp) {
-    let processedIndex = 0
-    for (let i = 0; i < this.#requests.length; i++) {
-      if (this.#requests[i].timestamp <= profileTimestamp) {
-        processedIndex = i + 1
-      }
-    }
-    return this.#requests.splice(0, processedIndex)
+    setInterval(cleanupFlamegraphsCache, cacheCleanupInterval).unref()
   }
-}
-async function flamegraphs (app, _opts) {
-  const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
-  const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
-  const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
-  const profilers = {}
-  const profilersConfigs = {}
-  app.setupFlamegraphs = async () => {
-    if (isFlamegraphsDisabled) {
-      app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
-      return
+  app.cleanupFlamegraphs = async () => {
+    if (workerStartedListener && app.watt?.runtime) {
+      app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
+      workerStartedListener = null
     }
-    const runtime = app.watt.runtime
-    const { applications } = await runtime.getApplications()
-    for (const application of applications) {
-      const appDetails = await runtime.getApplicationDetails(application.id)
-      const sourceMaps = appDetails.sourceMaps ?? false
-      profilersConfigs[application.id] = { durationMillis, sourceMaps }
+    // Explicitly stop all active profiling sessions to avoid memory corruption
+    if (!isFlamegraphsDisabled && app.watt?.runtime) {
+      try {
+        const workers = await app.watt.runtime.getWorkers()
+        const stopPromises = []
+        for (const workerFullId of Object.keys(workers)) {
+          // Stop both CPU and heap profiling on each worker
+          stopPromises.push(
+            app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
+              .catch(err => {
+                // Ignore errors if profiling wasn't running
+                if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
+                  app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
+                }
+              })
+          )
+          stopPromises.push(
+            app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
+              .catch(err => {
+                // Ignore errors if profiling wasn't running
+                if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
+                  app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
+                }
+              })
+          )
+        }
+        await Promise.all(stopPromises)
+        // Small delay to ensure native cleanup completes
+        await sleep(100)
+      } catch (err) {
+        app.log.warn({ err }, 'Failed to stop profiling during cleanup')
+      }
     }
   }
-  app.requestFlamegraphs = async (options = {}) => {
+  const profilesByWorkerId = {}
+  app.sendFlamegraphs = async (options = {}) => {
     if (isFlamegraphsDisabled) {
       app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
       return
     }
+    let { workerIds, alertId, profileType = 'cpu' } = options
     const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
     if (!scalerUrl) {
       app.log.error('No scaler URL found in ICC services, cannot send flamegraph')
@@ -210,91 +150,118 @@ async function flamegraphs (app, _opts) {
     const runtime = app.watt.runtime
-    let { workerIds, alertId, profileType = 'cpu' } = options
-    const servicesWorkers = {}
-    const workers = await runtime.getWorkers()
-    for (const workerId in workers) {
-      const workerInfo = workers[workerId]
-      const serviceId = workerInfo.application
-      servicesWorkers[serviceId] ??= []
-      servicesWorkers[serviceId].push(workerId)
+    if (!workerIds) {
+      const { applications } = await runtime.getApplications()
+      workerIds = applications.map(app => app.id)
     }
-    workerIds ??= Object.keys(servicesWorkers)
-    for (let workerId of workerIds) {
-      const [serviceId, workerIndex] = workerId.split(':')
-      if (workerIndex === undefined) {
-        workerId = servicesWorkers[serviceId][0]
-      }
-      if (workerId === undefined) {
-        app.log.error({ serviceId }, 'No worker found for an application')
-        continue
-      }
+    cleanupFlamegraphsCache()
+    const uploadPromises = workerIds.map(async (workerId) => {
+      const serviceId = workerId.split(':')[0]
       const profileKey = `${workerId}:${profileType}`
-      let profiler = profilers[profileKey]
-      if (!profiler) {
-        const config = profilersConfigs[serviceId]
-        profiler = new Profiler({
-          app,
-          workerId,
-          type: profileType,
-          duration: config.durationMillis,
-          sourceMaps: config.sourceMaps,
-          onProfile: createProfileHandler(scalerUrl, workerId, profileType)
-        })
-        profilers[profileKey] = profiler
+      let profile = profilesByWorkerId[profileKey]
+      if (profile !== undefined) {
+        if (alertId) {
+          app.log.info(
+            { workerId, alertId }, 'Flamegraph will be attached to the alert'
+          )
+          profile.waitingAlerts.push(alertId)
+        }
+        if (profile.flamegraphId === null) {
+          app.log.info({ workerId }, 'Waiting for flamegraph to be generated and sent')
+          return
+        }
       }
-      profiler.requestProfile({ alertId })
-    }
-  }
+      if (profile === undefined) {
+        profile = {
+          type: profileType,
+          data: null,
+          timestamp: null,
+          flamegraphId: null,
+          waitingAlerts: []
+        }
+        profilesByWorkerId[profileKey] = profile
-  function createProfileHandler (scalerUrl, workerId, profileType) {
-    const serviceId = workerId.split(':')[0]
+        const result = await getServiceFlamegraph(workerId, profileType)
+        if (!result || !(result.data instanceof Uint8Array)) {
+          app.log.error({ workerId }, 'Failed to get profile from service')
+          delete profilesByWorkerId[profileKey]
+          return
+        }
-    return async (err, profile, requests) => {
-      if (err) {
-        app.log.error({ err }, 'Failed to generate a profile')
-        return
+        profile.data = result.data
+        profile.timestamp = result.timestamp
       }
-      const alertIds = []
-      for (const request of requests) {
-        if (request.alertId) {
-          alertIds.push(request.alertId)
+      if (profile.flamegraphId === null || !alertId) {
+        try {
+          const flamegraph = await sendServiceFlamegraph(
+            scalerUrl,
+            serviceId,
+            profile.data,
+            profileType,
+            alertId
+          )
+          profile.flamegraphId = flamegraph.id
+        } catch (err) {
+          app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
+          delete profilesByWorkerId[profileKey]
+          return
         }
       }
-      process._rawDebug('--------ALERT IDS--------', alertIds)
-      try {
-        const alertId = alertIds.shift()
-        const flamegraph = await sendServiceFlamegraph(
+      const waitingAlerts = profile.waitingAlerts
+      if (waitingAlerts.length > 0) {
+        profile.waitingAlerts = []
+        await _attachFlamegraphToAlerts(
           scalerUrl,
           serviceId,
+          profile.flamegraphId,
           profile.data,
-          profileType,
-          alertId
+          profile.type,
+          waitingAlerts
         )
+      }
+    })
-        if (alertIds.length > 0) {
-          await _attachFlamegraphToAlerts(
-            scalerUrl,
-            serviceId,
-            flamegraph.id,
-            profile.data,
-            profileType,
-            alertIds
-          )
+    await Promise.all(uploadPromises)
+  }
+  async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
+    const runtime = app.watt.runtime
+    app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
+    try {
+      const [state, profile] = await Promise.all([
+        runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
+        runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
+      ])
+      return { data: profile, timestamp: state.latestProfileTimestamp }
+    } catch (err) {
+      if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
+        app.log.info(
+          { workerId, attempt, maxAttempts, attemptTimeout },
+          'No profile available for the service. Waiting for profiling to complete.'
+        )
+        if (attempt <= maxAttempts) {
+          await sleep(attemptTimeout)
+          return getServiceFlamegraph(workerId, profileType, attempt + 1)
+        }
+      } else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
+        app.log.info({ workerId }, 'ELU low, CPU profiling not active')
+      } else {
+        app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
+        const [serviceId, workerIndex] = workerId.split(':')
+        if (workerIndex) {
+          app.log.warn('Worker not available, trying to get profile from another worker')
+          return getServiceFlamegraph(serviceId, profileType)
         }
-      } catch (err) {
-        app.log.error({ err, workerId }, 'Failed to send flamegraph')
       }
     }
   }
@@ -302,7 +269,7 @@ async function flamegraphs (app, _opts) {
   async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
     const podId = app.instanceId
     const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
-    app.log.info({ serviceId, podId, profileType, alertId }, 'Sending flamegraph')
+    app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
     const query = { profileType }
     if (alertId) {
@@ -326,14 +293,8 @@ async function flamegraphs (app, _opts) {
       throw new Error(`Failed to send flamegraph: ${error}`)
     }
-    const flamegraph = await body.json()
-    app.log.info(
-      { serviceId, podId, profileType, flamegraph },
-      'Flamegraph successfully stored'
-    )
-    return flamegraph
+    const response = await body.json()
+    return response
   }
   // Function that supports ICC that doesn't have attach flamegraph API
@@ -406,10 +367,15 @@ async function flamegraphs (app, _opts) {
     }
   }
-  app.cleanupFlamegraphs = async () => {
-    // Stop all tracked profilers in parallel
-    const stopPromises = Object.values(profilers).map(profiler => profiler.stop())
-    await Promise.all(stopPromises)
+  function cleanupFlamegraphsCache () {
+    const now = Date.now()
+    for (const profileKey of Object.keys(profilesByWorkerId)) {
+      const timestamp = profilesByWorkerId[profileKey]?.timestamp
+      if (timestamp && now - timestamp > durationMillis) {
+        delete profilesByWorkerId[profileKey]
+      }
+    }
   }
 }

package/plugins/health-signals.js CHANGED Viewed

@@ -183,14 +183,12 @@ async function healthSignals (app, _opts) {
       app.log.error({ error }, 'Failed to send health signals to scaler')
     }
-    const response = await body.json()
+    const alert = await body.json()
-    process._rawDebug('--------SEND HEALTH SIGNALS--------', response)
-    app.requestFlamegraphs({
+    app.sendFlamegraphs({
       serviceIds: [serviceId],
       workerIds: [workerId],
-      alertId: response.alertId
+      alertId: alert.id
     }).catch(err => {
       app.log.error({ err }, 'Failed to send a flamegraph')
     })