@platformatic/watt-extra 1.6.3-alpha.5 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +8 -5
- package/package.json +1 -1
- package/plugins/alerts.js +25 -1
- package/plugins/env.js +2 -1
- package/plugins/flamegraphs.js +210 -244
- package/plugins/health-signals.js +3 -5
- package/plugins/update.js +2 -2
- package/test/alerts.test.js +179 -7
- package/test/health-signals.test.js +5 -2
- package/test/helper.js +1 -0
- package/test/trigger-flamegraphs.test.js +439 -187
- package/test/profiler.test.js +0 -443
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"permissions": {
|
|
3
3
|
"allow": [
|
|
4
|
-
"
|
|
5
|
-
"Bash(
|
|
6
|
-
"Bash(
|
|
7
|
-
"Bash(
|
|
8
|
-
"Bash(
|
|
4
|
+
"Read(//work/workspaces/workspace-platformatic/platformatic/**)",
|
|
5
|
+
"Bash(npx borp:*)",
|
|
6
|
+
"Bash(timeout 30 npx borp -c 1 --timeout=20000 ./test/trigger-flamegraphs.test.js)",
|
|
7
|
+
"Bash(xargs cat:*)",
|
|
8
|
+
"Bash(pnpm install)",
|
|
9
|
+
"Bash(find:*)",
|
|
10
|
+
"Bash(cat:*)",
|
|
11
|
+
"WebFetch(domain:github.com)"
|
|
9
12
|
],
|
|
10
13
|
"deny": [],
|
|
11
14
|
"ask": []
|
package/package.json
CHANGED
package/plugins/alerts.js
CHANGED
|
@@ -8,6 +8,7 @@ async function alerts (app, _opts) {
|
|
|
8
8
|
app.instanceConfig?.scaler?.alertRetentionWindow || 10 * 1000
|
|
9
9
|
|
|
10
10
|
const lastServicesAlertTime = {}
|
|
11
|
+
const workerStartTimes = new Map() // Track per-worker start times for grace period
|
|
11
12
|
|
|
12
13
|
async function setupAlerts () {
|
|
13
14
|
const scalerAlgorithmVersion = app.instanceConfig?.scaler?.version ?? 'v1'
|
|
@@ -17,6 +18,9 @@ async function alerts (app, _opts) {
|
|
|
17
18
|
}
|
|
18
19
|
app.log.info('Setting up v1 scaler alerts')
|
|
19
20
|
|
|
21
|
+
// Grace period during which alerts are suppressed per-worker.
|
|
22
|
+
const gracePeriodMs = app.env.PLT_ALERTS_GRACE_PERIOD_SEC * 1000
|
|
23
|
+
|
|
20
24
|
// Skip alerts setup if ICC is not configured
|
|
21
25
|
if (!app.env.PLT_ICC_URL) {
|
|
22
26
|
app.log.info('PLT_ICC_URL not set, skipping alerts setup')
|
|
@@ -33,6 +37,18 @@ async function alerts (app, _opts) {
|
|
|
33
37
|
return
|
|
34
38
|
}
|
|
35
39
|
|
|
40
|
+
// Default start time for workers that started before the listener was registered
|
|
41
|
+
const pluginStartTime = Date.now()
|
|
42
|
+
|
|
43
|
+
// Listen for worker start events to track start times
|
|
44
|
+
runtime.on('application:worker:started', (workerInfo) => {
|
|
45
|
+
const workerId = workerInfo?.id
|
|
46
|
+
if (workerId) {
|
|
47
|
+
workerStartTimes.set(workerId, Date.now())
|
|
48
|
+
app.log.debug({ workerId }, 'Worker started, tracking for grace period')
|
|
49
|
+
}
|
|
50
|
+
})
|
|
51
|
+
|
|
36
52
|
const processHealthInfo = async (healthInfo) => {
|
|
37
53
|
if (!healthInfo) {
|
|
38
54
|
app.log.error('No health info received')
|
|
@@ -55,6 +71,14 @@ async function alerts (app, _opts) {
|
|
|
55
71
|
healthCache.splice(0, validIndex)
|
|
56
72
|
}
|
|
57
73
|
|
|
74
|
+
// Skip sending alerts during worker's grace period.
|
|
75
|
+
// Use plugin start time as default for workers that started before the listener.
|
|
76
|
+
const workerStartTime = workerStartTimes.get(workerId) ?? pluginStartTime
|
|
77
|
+
if (timestamp - workerStartTime < gracePeriodMs) {
|
|
78
|
+
app.log.debug({ workerId }, 'Skipping alert during worker grace period')
|
|
79
|
+
return
|
|
80
|
+
}
|
|
81
|
+
|
|
58
82
|
// healthInfo is an object with the following structure:
|
|
59
83
|
// id: "service-1"
|
|
60
84
|
// service: "service-1"
|
|
@@ -112,7 +136,7 @@ async function alerts (app, _opts) {
|
|
|
112
136
|
|
|
113
137
|
const alert = await body.json()
|
|
114
138
|
|
|
115
|
-
app.
|
|
139
|
+
app.sendFlamegraphs({
|
|
116
140
|
workerIds: [workerId],
|
|
117
141
|
alertId: alert.id
|
|
118
142
|
}).catch(err => {
|
package/plugins/env.js
CHANGED
|
@@ -26,7 +26,8 @@ const schema = {
|
|
|
26
26
|
PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
|
|
27
27
|
PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
|
|
28
28
|
PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },
|
|
29
|
-
PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' }
|
|
29
|
+
PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' },
|
|
30
|
+
PLT_ALERTS_GRACE_PERIOD_SEC: { type: 'number', default: 30 }
|
|
30
31
|
}
|
|
31
32
|
}
|
|
32
33
|
|
package/plugins/flamegraphs.js
CHANGED
|
@@ -1,207 +1,147 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
+
import { setTimeout as sleep } from 'node:timers/promises'
|
|
3
4
|
import { request } from 'undici'
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
#requests
|
|
13
|
-
#isProfiling
|
|
14
|
-
#onProfile
|
|
15
|
-
#getProfileInterval
|
|
16
|
-
#stopProfileTimeout
|
|
17
|
-
|
|
18
|
-
constructor (options = {}) {
|
|
19
|
-
const { type, duration, workerId, sourceMaps, app, onProfile } = options
|
|
20
|
-
|
|
21
|
-
if (type !== 'cpu' && type !== 'heap') {
|
|
22
|
-
throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
|
|
23
|
-
}
|
|
24
|
-
if (typeof duration !== 'number') {
|
|
25
|
-
throw new Error('Invalid Profiler duration. Must be a number')
|
|
26
|
-
}
|
|
27
|
-
if (typeof workerId !== 'string') {
|
|
28
|
-
throw new Error('Invalid Worker ID. Must be a string')
|
|
29
|
-
}
|
|
30
|
-
if (!workerId.includes(':')) {
|
|
31
|
-
throw new Error('Worker ID must include the service ID and worker index')
|
|
32
|
-
}
|
|
33
|
-
if (typeof onProfile !== 'function') {
|
|
34
|
-
throw new Error('Invalid onProfile handler. Must be a function')
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
this.#type = type
|
|
38
|
-
this.#duration = duration
|
|
39
|
-
this.#workerId = workerId
|
|
40
|
-
this.#onProfile = onProfile
|
|
6
|
+
async function flamegraphs (app, _opts) {
|
|
7
|
+
const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
|
|
8
|
+
const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
|
|
9
|
+
const flamegraphsELUThreshold = app.env.PLT_FLAMEGRAPHS_ELU_THRESHOLD
|
|
10
|
+
const flamegraphsGracePeriod = app.env.PLT_FLAMEGRAPHS_GRACE_PERIOD
|
|
11
|
+
const flamegraphsAttemptTimeout = app.env.PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT
|
|
12
|
+
const flamegraphsCacheCleanupInterval = app.env.PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL
|
|
41
13
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
14
|
+
const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
|
|
15
|
+
const eluThreshold = parseFloat(flamegraphsELUThreshold)
|
|
16
|
+
const gracePeriod = parseInt(flamegraphsGracePeriod)
|
|
17
|
+
const attemptTimeout = Math.min(parseInt(flamegraphsAttemptTimeout), durationMillis)
|
|
18
|
+
const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
|
|
19
|
+
const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
|
|
47
20
|
|
|
48
|
-
|
|
49
|
-
this.#isProfiling = false
|
|
21
|
+
let workerStartedListener = null
|
|
50
22
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
workerId: this.#workerId,
|
|
54
|
-
profilerType: this.#type
|
|
55
|
-
})
|
|
56
|
-
}
|
|
23
|
+
const startProfilingOnWorker = async (runtime, workerFullId, logContext = {}) => {
|
|
24
|
+
await sleep(gracePeriod)
|
|
57
25
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
this.#requests.push(request)
|
|
62
|
-
this.#unscheduleStopProfiling()
|
|
26
|
+
// Get application details to read service-level sourceMaps setting
|
|
27
|
+
const appDetails = await runtime.getApplicationDetails(workerFullId)
|
|
28
|
+
const sourceMaps = appDetails.sourceMaps ?? false
|
|
63
29
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
30
|
+
try {
|
|
31
|
+
// Start CPU profiling
|
|
32
|
+
await runtime.sendCommandToApplication(
|
|
33
|
+
workerFullId,
|
|
34
|
+
'startProfiling',
|
|
35
|
+
{ durationMillis, eluThreshold, type: 'cpu', sourceMaps }
|
|
36
|
+
)
|
|
68
37
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
if (this.#isProfiling) {
|
|
79
|
-
await this.#stopProfiling()
|
|
38
|
+
// Start HEAP profiling
|
|
39
|
+
await runtime.sendCommandToApplication(
|
|
40
|
+
workerFullId,
|
|
41
|
+
'startProfiling',
|
|
42
|
+
{ durationMillis, eluThreshold, type: 'heap', sourceMaps }
|
|
43
|
+
)
|
|
44
|
+
} catch (err) {
|
|
45
|
+
app.log.error({ err, ...logContext }, 'Failed to start profiling')
|
|
46
|
+
throw err
|
|
80
47
|
}
|
|
81
48
|
}
|
|
82
49
|
|
|
83
|
-
async
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
} catch (err) {
|
|
87
|
-
this.#log.error({ err }, 'Failed to start profiling')
|
|
88
|
-
const requests = this.#getProfileRequests(Date.now())
|
|
89
|
-
this.#onProfile(err, null, requests)
|
|
50
|
+
app.setupFlamegraphs = async () => {
|
|
51
|
+
if (isFlamegraphsDisabled) {
|
|
52
|
+
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
|
|
90
53
|
return
|
|
91
54
|
}
|
|
92
55
|
|
|
93
|
-
|
|
94
|
-
() => this.#processProfile(),
|
|
95
|
-
this.#duration
|
|
96
|
-
).unref()
|
|
97
|
-
}
|
|
56
|
+
app.log.info('Start profiling services')
|
|
98
57
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
const profile = await this.#getProfile()
|
|
102
|
-
const requests = this.#getProfileRequests(profile.timestamp)
|
|
103
|
-
this.#onProfile(null, profile, requests)
|
|
104
|
-
} catch (err) {
|
|
105
|
-
this.#log.error({ err }, 'Failed to generate a profile')
|
|
106
|
-
const requests = this.#getProfileRequests(Date.now())
|
|
107
|
-
this.#onProfile(err, null, requests)
|
|
108
|
-
}
|
|
58
|
+
const runtime = app.watt.runtime
|
|
59
|
+
const workers = await runtime.getWorkers()
|
|
109
60
|
|
|
110
|
-
|
|
111
|
-
|
|
61
|
+
const promises = []
|
|
62
|
+
for (const [workerFullId, workerInfo] of Object.entries(workers)) {
|
|
63
|
+
if (workerInfo.status === 'started') {
|
|
64
|
+
const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
|
|
65
|
+
promises.push(promise)
|
|
66
|
+
}
|
|
112
67
|
}
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
#scheduleStopProfiling () {
|
|
116
|
-
// Stop profiling after the duration/2 if there are no more requests
|
|
117
|
-
this.#stopProfileTimeout = setTimeout(
|
|
118
|
-
() => this.stop(),
|
|
119
|
-
this.#duration / 2
|
|
120
|
-
).unref()
|
|
121
|
-
}
|
|
122
68
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
69
|
+
const results = await Promise.allSettled(promises)
|
|
70
|
+
for (const result of results) {
|
|
71
|
+
if (result.status === 'rejected') {
|
|
72
|
+
app.log.error({ result }, 'Failed to start profiling')
|
|
73
|
+
}
|
|
127
74
|
}
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
async #startProfiling () {
|
|
131
|
-
this.#isProfiling = true
|
|
132
|
-
this.#log.info('Starting profiling')
|
|
133
75
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
76
|
+
// Listen for new workers starting and start profiling on them
|
|
77
|
+
workerStartedListener = ({ application, worker }) => {
|
|
78
|
+
if (isFlamegraphsDisabled) {
|
|
79
|
+
return
|
|
80
|
+
}
|
|
138
81
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
this.#log.info('Stopping profiling')
|
|
82
|
+
const workerFullId = [application, worker].join(':')
|
|
83
|
+
app.log.info({ application, worker }, 'Starting profiling on new worker')
|
|
142
84
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
)
|
|
147
|
-
} catch (err) {
|
|
148
|
-
// Ignore errors if the app is already closing
|
|
149
|
-
this.#log.debug({ err }, 'Failed to stop profiling')
|
|
85
|
+
startProfilingOnWorker(runtime, workerFullId, { application, worker }).catch(() => {
|
|
86
|
+
// Error already logged in startProfilingOnWorker
|
|
87
|
+
})
|
|
150
88
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
async #getProfile () {
|
|
154
|
-
this.#log.info('Getting profile from worker')
|
|
155
|
-
|
|
156
|
-
const [state, profile] = await Promise.all([
|
|
157
|
-
this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
|
|
158
|
-
this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
|
|
159
|
-
])
|
|
160
|
-
return { data: profile, timestamp: state.latestProfileTimestamp }
|
|
161
|
-
}
|
|
89
|
+
runtime.on('application:worker:started', workerStartedListener)
|
|
162
90
|
|
|
163
|
-
|
|
164
|
-
let processedIndex = 0
|
|
165
|
-
for (let i = 0; i < this.#requests.length; i++) {
|
|
166
|
-
if (this.#requests[i].timestamp <= profileTimestamp) {
|
|
167
|
-
processedIndex = i + 1
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
return this.#requests.splice(0, processedIndex)
|
|
91
|
+
setInterval(cleanupFlamegraphsCache, cacheCleanupInterval).unref()
|
|
171
92
|
}
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
async function flamegraphs (app, _opts) {
|
|
175
|
-
const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
|
|
176
|
-
const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
|
|
177
|
-
|
|
178
|
-
const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
|
|
179
|
-
|
|
180
|
-
const profilers = {}
|
|
181
|
-
const profilersConfigs = {}
|
|
182
93
|
|
|
183
|
-
app.
|
|
184
|
-
if (
|
|
185
|
-
app.
|
|
186
|
-
|
|
94
|
+
app.cleanupFlamegraphs = async () => {
|
|
95
|
+
if (workerStartedListener && app.watt?.runtime) {
|
|
96
|
+
app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
|
|
97
|
+
workerStartedListener = null
|
|
187
98
|
}
|
|
188
99
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
100
|
+
// Explicitly stop all active profiling sessions to avoid memory corruption
|
|
101
|
+
if (!isFlamegraphsDisabled && app.watt?.runtime) {
|
|
102
|
+
try {
|
|
103
|
+
const workers = await app.watt.runtime.getWorkers()
|
|
104
|
+
const stopPromises = []
|
|
105
|
+
for (const workerFullId of Object.keys(workers)) {
|
|
106
|
+
// Stop both CPU and heap profiling on each worker
|
|
107
|
+
stopPromises.push(
|
|
108
|
+
app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
|
|
109
|
+
.catch(err => {
|
|
110
|
+
// Ignore errors if profiling wasn't running
|
|
111
|
+
if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
|
|
112
|
+
app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
|
|
113
|
+
}
|
|
114
|
+
})
|
|
115
|
+
)
|
|
116
|
+
stopPromises.push(
|
|
117
|
+
app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
|
|
118
|
+
.catch(err => {
|
|
119
|
+
// Ignore errors if profiling wasn't running
|
|
120
|
+
if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
|
|
121
|
+
app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
|
|
122
|
+
}
|
|
123
|
+
})
|
|
124
|
+
)
|
|
125
|
+
}
|
|
126
|
+
await Promise.all(stopPromises)
|
|
127
|
+
// Small delay to ensure native cleanup completes
|
|
128
|
+
await sleep(100)
|
|
129
|
+
} catch (err) {
|
|
130
|
+
app.log.warn({ err }, 'Failed to stop profiling during cleanup')
|
|
131
|
+
}
|
|
196
132
|
}
|
|
197
133
|
}
|
|
198
134
|
|
|
199
|
-
|
|
135
|
+
const profilesByWorkerId = {}
|
|
136
|
+
|
|
137
|
+
app.sendFlamegraphs = async (options = {}) => {
|
|
200
138
|
if (isFlamegraphsDisabled) {
|
|
201
139
|
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
|
|
202
140
|
return
|
|
203
141
|
}
|
|
204
142
|
|
|
143
|
+
let { workerIds, alertId, profileType = 'cpu' } = options
|
|
144
|
+
|
|
205
145
|
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
206
146
|
if (!scalerUrl) {
|
|
207
147
|
app.log.error('No scaler URL found in ICC services, cannot send flamegraph')
|
|
@@ -210,91 +150,118 @@ async function flamegraphs (app, _opts) {
|
|
|
210
150
|
|
|
211
151
|
const runtime = app.watt.runtime
|
|
212
152
|
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
const workers = await runtime.getWorkers()
|
|
217
|
-
for (const workerId in workers) {
|
|
218
|
-
const workerInfo = workers[workerId]
|
|
219
|
-
const serviceId = workerInfo.application
|
|
220
|
-
|
|
221
|
-
servicesWorkers[serviceId] ??= []
|
|
222
|
-
servicesWorkers[serviceId].push(workerId)
|
|
153
|
+
if (!workerIds) {
|
|
154
|
+
const { applications } = await runtime.getApplications()
|
|
155
|
+
workerIds = applications.map(app => app.id)
|
|
223
156
|
}
|
|
224
157
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
for (let workerId of workerIds) {
|
|
228
|
-
const [serviceId, workerIndex] = workerId.split(':')
|
|
229
|
-
if (workerIndex === undefined) {
|
|
230
|
-
workerId = servicesWorkers[serviceId][0]
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
if (workerId === undefined) {
|
|
234
|
-
app.log.error({ serviceId }, 'No worker found for an application')
|
|
235
|
-
continue
|
|
236
|
-
}
|
|
158
|
+
cleanupFlamegraphsCache()
|
|
237
159
|
|
|
160
|
+
const uploadPromises = workerIds.map(async (workerId) => {
|
|
161
|
+
const serviceId = workerId.split(':')[0]
|
|
238
162
|
const profileKey = `${workerId}:${profileType}`
|
|
239
163
|
|
|
240
|
-
let
|
|
241
|
-
if (
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
164
|
+
let profile = profilesByWorkerId[profileKey]
|
|
165
|
+
if (profile !== undefined) {
|
|
166
|
+
if (alertId) {
|
|
167
|
+
app.log.info(
|
|
168
|
+
{ workerId, alertId }, 'Flamegraph will be attached to the alert'
|
|
169
|
+
)
|
|
170
|
+
profile.waitingAlerts.push(alertId)
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
if (profile.flamegraphId === null) {
|
|
174
|
+
app.log.info({ workerId }, 'Waiting for flamegraph to be generated and sent')
|
|
175
|
+
return
|
|
176
|
+
}
|
|
252
177
|
}
|
|
253
178
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
179
|
+
if (profile === undefined) {
|
|
180
|
+
profile = {
|
|
181
|
+
type: profileType,
|
|
182
|
+
data: null,
|
|
183
|
+
timestamp: null,
|
|
184
|
+
flamegraphId: null,
|
|
185
|
+
waitingAlerts: []
|
|
186
|
+
}
|
|
187
|
+
profilesByWorkerId[profileKey] = profile
|
|
257
188
|
|
|
258
|
-
|
|
259
|
-
|
|
189
|
+
const result = await getServiceFlamegraph(workerId, profileType)
|
|
190
|
+
if (!result || !(result.data instanceof Uint8Array)) {
|
|
191
|
+
app.log.error({ workerId }, 'Failed to get profile from service')
|
|
192
|
+
delete profilesByWorkerId[profileKey]
|
|
193
|
+
return
|
|
194
|
+
}
|
|
260
195
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
app.log.error({ err }, 'Failed to generate a profile')
|
|
264
|
-
return
|
|
196
|
+
profile.data = result.data
|
|
197
|
+
profile.timestamp = result.timestamp
|
|
265
198
|
}
|
|
266
199
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
200
|
+
if (profile.flamegraphId === null || !alertId) {
|
|
201
|
+
try {
|
|
202
|
+
const flamegraph = await sendServiceFlamegraph(
|
|
203
|
+
scalerUrl,
|
|
204
|
+
serviceId,
|
|
205
|
+
profile.data,
|
|
206
|
+
profileType,
|
|
207
|
+
alertId
|
|
208
|
+
)
|
|
209
|
+
profile.flamegraphId = flamegraph.id
|
|
210
|
+
} catch (err) {
|
|
211
|
+
app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
|
|
212
|
+
delete profilesByWorkerId[profileKey]
|
|
213
|
+
return
|
|
271
214
|
}
|
|
272
215
|
}
|
|
273
216
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
const flamegraph = await sendServiceFlamegraph(
|
|
217
|
+
const waitingAlerts = profile.waitingAlerts
|
|
218
|
+
if (waitingAlerts.length > 0) {
|
|
219
|
+
profile.waitingAlerts = []
|
|
220
|
+
await _attachFlamegraphToAlerts(
|
|
279
221
|
scalerUrl,
|
|
280
222
|
serviceId,
|
|
223
|
+
profile.flamegraphId,
|
|
281
224
|
profile.data,
|
|
282
|
-
|
|
283
|
-
|
|
225
|
+
profile.type,
|
|
226
|
+
waitingAlerts
|
|
284
227
|
)
|
|
228
|
+
}
|
|
229
|
+
})
|
|
285
230
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
231
|
+
await Promise.all(uploadPromises)
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
|
|
235
|
+
const runtime = app.watt.runtime
|
|
236
|
+
|
|
237
|
+
app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
|
|
238
|
+
|
|
239
|
+
try {
|
|
240
|
+
const [state, profile] = await Promise.all([
|
|
241
|
+
runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
|
|
242
|
+
runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
|
|
243
|
+
])
|
|
244
|
+
return { data: profile, timestamp: state.latestProfileTimestamp }
|
|
245
|
+
} catch (err) {
|
|
246
|
+
if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
|
|
247
|
+
app.log.info(
|
|
248
|
+
{ workerId, attempt, maxAttempts, attemptTimeout },
|
|
249
|
+
'No profile available for the service. Waiting for profiling to complete.'
|
|
250
|
+
)
|
|
251
|
+
if (attempt <= maxAttempts) {
|
|
252
|
+
await sleep(attemptTimeout)
|
|
253
|
+
return getServiceFlamegraph(workerId, profileType, attempt + 1)
|
|
254
|
+
}
|
|
255
|
+
} else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
|
|
256
|
+
app.log.info({ workerId }, 'ELU low, CPU profiling not active')
|
|
257
|
+
} else {
|
|
258
|
+
app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
|
|
259
|
+
|
|
260
|
+
const [serviceId, workerIndex] = workerId.split(':')
|
|
261
|
+
if (workerIndex) {
|
|
262
|
+
app.log.warn('Worker not available, trying to get profile from another worker')
|
|
263
|
+
return getServiceFlamegraph(serviceId, profileType)
|
|
295
264
|
}
|
|
296
|
-
} catch (err) {
|
|
297
|
-
app.log.error({ err, workerId }, 'Failed to send flamegraph')
|
|
298
265
|
}
|
|
299
266
|
}
|
|
300
267
|
}
|
|
@@ -302,7 +269,7 @@ async function flamegraphs (app, _opts) {
|
|
|
302
269
|
async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
|
|
303
270
|
const podId = app.instanceId
|
|
304
271
|
const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
|
|
305
|
-
app.log.info({ serviceId, podId, profileType
|
|
272
|
+
app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
|
|
306
273
|
|
|
307
274
|
const query = { profileType }
|
|
308
275
|
if (alertId) {
|
|
@@ -326,14 +293,8 @@ async function flamegraphs (app, _opts) {
|
|
|
326
293
|
throw new Error(`Failed to send flamegraph: ${error}`)
|
|
327
294
|
}
|
|
328
295
|
|
|
329
|
-
const
|
|
330
|
-
|
|
331
|
-
app.log.info(
|
|
332
|
-
{ serviceId, podId, profileType, flamegraph },
|
|
333
|
-
'Flamegraph successfully stored'
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
return flamegraph
|
|
296
|
+
const response = await body.json()
|
|
297
|
+
return response
|
|
337
298
|
}
|
|
338
299
|
|
|
339
300
|
// Function that supports ICC that doesn't have attach flamegraph API
|
|
@@ -406,10 +367,15 @@ async function flamegraphs (app, _opts) {
|
|
|
406
367
|
}
|
|
407
368
|
}
|
|
408
369
|
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
370
|
+
function cleanupFlamegraphsCache () {
|
|
371
|
+
const now = Date.now()
|
|
372
|
+
|
|
373
|
+
for (const profileKey of Object.keys(profilesByWorkerId)) {
|
|
374
|
+
const timestamp = profilesByWorkerId[profileKey]?.timestamp
|
|
375
|
+
if (timestamp && now - timestamp > durationMillis) {
|
|
376
|
+
delete profilesByWorkerId[profileKey]
|
|
377
|
+
}
|
|
378
|
+
}
|
|
413
379
|
}
|
|
414
380
|
}
|
|
415
381
|
|
|
@@ -183,14 +183,12 @@ async function healthSignals (app, _opts) {
|
|
|
183
183
|
app.log.error({ error }, 'Failed to send health signals to scaler')
|
|
184
184
|
}
|
|
185
185
|
|
|
186
|
-
const
|
|
186
|
+
const alert = await body.json()
|
|
187
187
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
app.requestFlamegraphs({
|
|
188
|
+
app.sendFlamegraphs({
|
|
191
189
|
serviceIds: [serviceId],
|
|
192
190
|
workerIds: [workerId],
|
|
193
|
-
alertId:
|
|
191
|
+
alertId: alert.id
|
|
194
192
|
}).catch(err => {
|
|
195
193
|
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
196
194
|
})
|