@platformatic/watt-extra 1.7.0 → 1.7.1-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +5 -8
- package/package.json +1 -1
- package/plugins/alerts.js +2 -7
- package/plugins/flamegraphs.js +274 -205
- package/plugins/health-signals.js +4 -9
- package/plugins/update.js +2 -2
- package/test/alerts.test.js +7 -17
- package/test/health-signals.test.js +3 -6
- package/test/profiler.test.js +443 -0
- package/test/trigger-flamegraphs.test.js +257 -414
|
@@ -1,14 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"permissions": {
|
|
3
3
|
"allow": [
|
|
4
|
-
"
|
|
5
|
-
"Bash(
|
|
6
|
-
"Bash(
|
|
7
|
-
"Bash(
|
|
8
|
-
"Bash(
|
|
9
|
-
"Bash(find:*)",
|
|
10
|
-
"Bash(cat:*)",
|
|
11
|
-
"WebFetch(domain:github.com)"
|
|
4
|
+
"Bash(node --test-only:*)",
|
|
5
|
+
"Bash(node --test:*)",
|
|
6
|
+
"Bash(for i in {1..3})",
|
|
7
|
+
"Bash(do echo \"=== Run $i ===\")",
|
|
8
|
+
"Bash(done)"
|
|
12
9
|
],
|
|
13
10
|
"deny": [],
|
|
14
11
|
"ask": []
|
package/package.json
CHANGED
package/plugins/alerts.js
CHANGED
|
@@ -56,7 +56,6 @@ async function alerts (app, _opts) {
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
const timestamp = Date.now()
|
|
59
|
-
const workerId = healthInfo.id
|
|
60
59
|
const serviceId = healthInfo.application
|
|
61
60
|
const healthWithTimestamp = { ...healthInfo, timestamp, service: serviceId }
|
|
62
61
|
delete healthWithTimestamp.healthConfig // we don't need to store this
|
|
@@ -136,12 +135,8 @@ async function alerts (app, _opts) {
|
|
|
136
135
|
|
|
137
136
|
const alert = await body.json()
|
|
138
137
|
|
|
139
|
-
app.
|
|
140
|
-
|
|
141
|
-
alertId: alert.id
|
|
142
|
-
}).catch(err => {
|
|
143
|
-
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
144
|
-
})
|
|
138
|
+
app.requestFlamegraphs({ serviceIds: [serviceId], alertId: alert.id })
|
|
139
|
+
.catch(err => app.log.error({ err }, 'Failed to send a flamegraph'))
|
|
145
140
|
}
|
|
146
141
|
}
|
|
147
142
|
|
package/plugins/flamegraphs.js
CHANGED
|
@@ -1,146 +1,231 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
-
import { setTimeout as sleep } from 'node:timers/promises'
|
|
4
3
|
import { request } from 'undici'
|
|
5
4
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
export class Profiler {
|
|
6
|
+
#workerId
|
|
7
|
+
#type
|
|
8
|
+
#duration
|
|
9
|
+
#profileOptions
|
|
10
|
+
#runtime
|
|
11
|
+
#log
|
|
12
|
+
#requests
|
|
13
|
+
#isProfiling
|
|
14
|
+
#onProfile
|
|
15
|
+
#getProfileInterval
|
|
16
|
+
#stopProfileTimeout
|
|
17
|
+
|
|
18
|
+
constructor (options = {}) {
|
|
19
|
+
const { type, duration, workerId, sourceMaps, app, onProfile } = options
|
|
20
|
+
|
|
21
|
+
if (type !== 'cpu' && type !== 'heap') {
|
|
22
|
+
throw new Error('Invalid Profiler type. Must be either "cpu" or "heap"')
|
|
23
|
+
}
|
|
24
|
+
if (typeof duration !== 'number') {
|
|
25
|
+
throw new Error('Invalid Profiler duration. Must be a number')
|
|
26
|
+
}
|
|
27
|
+
if (typeof workerId !== 'string') {
|
|
28
|
+
throw new Error('Invalid Worker ID. Must be a string')
|
|
29
|
+
}
|
|
30
|
+
if (!workerId.includes(':')) {
|
|
31
|
+
throw new Error('Worker ID must include the service ID and worker index')
|
|
32
|
+
}
|
|
33
|
+
if (typeof onProfile !== 'function') {
|
|
34
|
+
throw new Error('Invalid onProfile handler. Must be a function')
|
|
35
|
+
}
|
|
13
36
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
const maxAttempts = Math.ceil(durationMillis / attemptTimeout) + 1
|
|
19
|
-
const cacheCleanupInterval = parseInt(flamegraphsCacheCleanupInterval)
|
|
37
|
+
this.#type = type
|
|
38
|
+
this.#duration = duration
|
|
39
|
+
this.#workerId = workerId
|
|
40
|
+
this.#onProfile = onProfile
|
|
20
41
|
|
|
21
|
-
|
|
42
|
+
this.#profileOptions = {
|
|
43
|
+
type,
|
|
44
|
+
durationMillis: duration,
|
|
45
|
+
sourceMaps: sourceMaps ?? false
|
|
46
|
+
}
|
|
22
47
|
|
|
23
|
-
|
|
24
|
-
|
|
48
|
+
this.#requests = []
|
|
49
|
+
this.#isProfiling = false
|
|
25
50
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
51
|
+
this.#runtime = app.watt.runtime
|
|
52
|
+
this.#log = app.log.child({
|
|
53
|
+
workerId: this.#workerId,
|
|
54
|
+
profilerType: this.#type
|
|
55
|
+
})
|
|
56
|
+
}
|
|
29
57
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
workerFullId,
|
|
34
|
-
'startProfiling',
|
|
35
|
-
{ durationMillis, eluThreshold, type: 'cpu', sourceMaps }
|
|
36
|
-
)
|
|
58
|
+
get workerId () {
|
|
59
|
+
return this.#workerId
|
|
60
|
+
}
|
|
37
61
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
62
|
+
get isProfiling () {
|
|
63
|
+
return this.#isProfiling
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
async requestProfile (request = {}) {
|
|
67
|
+
request.timestamp ??= Date.now()
|
|
68
|
+
this.#requests.push(request)
|
|
69
|
+
this.#unscheduleStopProfiling()
|
|
70
|
+
|
|
71
|
+
if (!this.#isProfiling) {
|
|
72
|
+
this.#startProfilingLoop()
|
|
47
73
|
}
|
|
48
74
|
}
|
|
49
75
|
|
|
50
|
-
|
|
51
|
-
if (
|
|
52
|
-
|
|
76
|
+
async stop () {
|
|
77
|
+
if (this.#getProfileInterval) {
|
|
78
|
+
clearInterval(this.#getProfileInterval)
|
|
79
|
+
this.#getProfileInterval = null
|
|
80
|
+
}
|
|
81
|
+
if (this.#stopProfileTimeout) {
|
|
82
|
+
clearTimeout(this.#stopProfileTimeout)
|
|
83
|
+
this.#stopProfileTimeout = null
|
|
84
|
+
}
|
|
85
|
+
if (this.#isProfiling) {
|
|
86
|
+
const requests = this.#getProfileRequests()
|
|
87
|
+
try {
|
|
88
|
+
const profile = await this.#stopProfiling()
|
|
89
|
+
if (requests.length > 0) {
|
|
90
|
+
this.#onProfile(null, profile, requests)
|
|
91
|
+
}
|
|
92
|
+
} catch (err) {
|
|
93
|
+
this.#log.error({ err }, 'Failed to stop profiling')
|
|
94
|
+
if (requests.length > 0) {
|
|
95
|
+
this.#onProfile(err, null, requests)
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
async #startProfilingLoop () {
|
|
102
|
+
try {
|
|
103
|
+
await this.#startProfiling()
|
|
104
|
+
} catch (err) {
|
|
105
|
+
this.#log.error({ err }, 'Failed to start profiling')
|
|
106
|
+
const requests = this.#getProfileRequests()
|
|
107
|
+
this.#onProfile(err, null, requests)
|
|
53
108
|
return
|
|
54
109
|
}
|
|
55
110
|
|
|
56
|
-
|
|
111
|
+
this.#getProfileInterval = setInterval(
|
|
112
|
+
() => this.#processProfile(),
|
|
113
|
+
this.#duration
|
|
114
|
+
).unref()
|
|
115
|
+
}
|
|
57
116
|
|
|
58
|
-
|
|
59
|
-
|
|
117
|
+
async #processProfile () {
|
|
118
|
+
try {
|
|
119
|
+
const profile = await this.#getProfile()
|
|
120
|
+
const requests = this.#getProfileRequests(profile.timestamp)
|
|
121
|
+
this.#onProfile(null, profile, requests)
|
|
122
|
+
} catch (err) {
|
|
123
|
+
this.#log.error({ err }, 'Failed to generate a profile')
|
|
124
|
+
const requests = this.#getProfileRequests()
|
|
125
|
+
this.#onProfile(err, null, requests)
|
|
126
|
+
}
|
|
60
127
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
if (workerInfo.status === 'started') {
|
|
64
|
-
const promise = startProfilingOnWorker(runtime, workerFullId, { workerFullId })
|
|
65
|
-
promises.push(promise)
|
|
66
|
-
}
|
|
128
|
+
if (this.#requests.length === 0) {
|
|
129
|
+
this.#scheduleStopProfiling()
|
|
67
130
|
}
|
|
131
|
+
}
|
|
68
132
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
133
|
+
#scheduleStopProfiling () {
|
|
134
|
+
// Stop profiling after the duration/2 if there are no more requests
|
|
135
|
+
this.#stopProfileTimeout = setTimeout(
|
|
136
|
+
() => this.stop(),
|
|
137
|
+
this.#duration / 2
|
|
138
|
+
).unref()
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
#unscheduleStopProfiling () {
|
|
142
|
+
if (this.#stopProfileTimeout) {
|
|
143
|
+
clearTimeout(this.#stopProfileTimeout)
|
|
144
|
+
this.#stopProfileTimeout = null
|
|
74
145
|
}
|
|
146
|
+
}
|
|
75
147
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
return
|
|
80
|
-
}
|
|
148
|
+
async #startProfiling () {
|
|
149
|
+
this.#isProfiling = true
|
|
150
|
+
this.#log.info('Starting profiling')
|
|
81
151
|
|
|
82
|
-
|
|
83
|
-
|
|
152
|
+
await this.#runtime.sendCommandToApplication(
|
|
153
|
+
this.#workerId, 'startProfiling', this.#profileOptions
|
|
154
|
+
)
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
async #stopProfiling () {
|
|
158
|
+
this.#isProfiling = false
|
|
159
|
+
this.#log.info('Stopping profiling')
|
|
84
160
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
161
|
+
try {
|
|
162
|
+
const profile = await this.#runtime.sendCommandToApplication(
|
|
163
|
+
this.#workerId, 'stopProfiling', this.#profileOptions
|
|
164
|
+
)
|
|
165
|
+
return profile
|
|
166
|
+
} catch (err) {
|
|
167
|
+
// Ignore errors if the app is already closing
|
|
168
|
+
this.#log.debug({ err }, 'Failed to stop profiling')
|
|
88
169
|
}
|
|
89
|
-
|
|
170
|
+
}
|
|
90
171
|
|
|
91
|
-
|
|
172
|
+
async #getProfile () {
|
|
173
|
+
this.#log.info('Getting profile from worker')
|
|
174
|
+
|
|
175
|
+
const [state, profile] = await Promise.all([
|
|
176
|
+
this.#runtime.sendCommandToApplication(this.#workerId, 'getProfilingState', { type: this.#type }),
|
|
177
|
+
this.#runtime.sendCommandToApplication(this.#workerId, 'getLastProfile', { type: this.#type })
|
|
178
|
+
])
|
|
179
|
+
return { data: profile, timestamp: state.latestProfileTimestamp }
|
|
92
180
|
}
|
|
93
181
|
|
|
94
|
-
|
|
95
|
-
if (
|
|
96
|
-
|
|
97
|
-
|
|
182
|
+
#getProfileRequests (profileTimestamp) {
|
|
183
|
+
if (profileTimestamp === undefined) {
|
|
184
|
+
const requests = this.#requests
|
|
185
|
+
this.#requests = []
|
|
186
|
+
return requests
|
|
98
187
|
}
|
|
99
188
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
const stopPromises = []
|
|
105
|
-
for (const workerFullId of Object.keys(workers)) {
|
|
106
|
-
// Stop both CPU and heap profiling on each worker
|
|
107
|
-
stopPromises.push(
|
|
108
|
-
app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
|
|
109
|
-
.catch(err => {
|
|
110
|
-
// Ignore errors if profiling wasn't running
|
|
111
|
-
if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
|
|
112
|
-
app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
|
|
113
|
-
}
|
|
114
|
-
})
|
|
115
|
-
)
|
|
116
|
-
stopPromises.push(
|
|
117
|
-
app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
|
|
118
|
-
.catch(err => {
|
|
119
|
-
// Ignore errors if profiling wasn't running
|
|
120
|
-
if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
|
|
121
|
-
app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
|
|
122
|
-
}
|
|
123
|
-
})
|
|
124
|
-
)
|
|
125
|
-
}
|
|
126
|
-
await Promise.all(stopPromises)
|
|
127
|
-
// Small delay to ensure native cleanup completes
|
|
128
|
-
await sleep(100)
|
|
129
|
-
} catch (err) {
|
|
130
|
-
app.log.warn({ err }, 'Failed to stop profiling during cleanup')
|
|
189
|
+
let processedIndex = 0
|
|
190
|
+
for (let i = 0; i < this.#requests.length; i++) {
|
|
191
|
+
if (this.#requests[i].timestamp <= profileTimestamp) {
|
|
192
|
+
processedIndex = i + 1
|
|
131
193
|
}
|
|
132
194
|
}
|
|
195
|
+
return this.#requests.splice(0, processedIndex)
|
|
133
196
|
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
async function flamegraphs (app, _opts) {
|
|
200
|
+
const isFlamegraphsDisabled = app.env.PLT_DISABLE_FLAMEGRAPHS
|
|
201
|
+
const flamegraphsIntervalSec = app.env.PLT_FLAMEGRAPHS_INTERVAL_SEC
|
|
202
|
+
|
|
203
|
+
const durationMillis = parseInt(flamegraphsIntervalSec) * 1000
|
|
134
204
|
|
|
135
|
-
const
|
|
205
|
+
const profilers = {}
|
|
206
|
+
const profilersConfigs = {}
|
|
136
207
|
|
|
137
|
-
app.
|
|
208
|
+
app.setupFlamegraphs = async () => {
|
|
138
209
|
if (isFlamegraphsDisabled) {
|
|
139
|
-
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set,
|
|
210
|
+
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, skipping profiling')
|
|
140
211
|
return
|
|
141
212
|
}
|
|
142
213
|
|
|
143
|
-
|
|
214
|
+
const runtime = app.watt.runtime
|
|
215
|
+
const { applications } = await runtime.getApplications()
|
|
216
|
+
|
|
217
|
+
for (const application of applications) {
|
|
218
|
+
const appDetails = await runtime.getApplicationDetails(application.id)
|
|
219
|
+
const sourceMaps = appDetails.sourceMaps ?? false
|
|
220
|
+
profilersConfigs[application.id] = { durationMillis, sourceMaps }
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
app.requestFlamegraphs = async (options = {}) => {
|
|
225
|
+
if (isFlamegraphsDisabled) {
|
|
226
|
+
app.log.info('PLT_DISABLE_FLAMEGRAPHS is set, flamegraphs are disabled')
|
|
227
|
+
return
|
|
228
|
+
}
|
|
144
229
|
|
|
145
230
|
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
146
231
|
if (!scalerUrl) {
|
|
@@ -150,118 +235,94 @@ async function flamegraphs (app, _opts) {
|
|
|
150
235
|
|
|
151
236
|
const runtime = app.watt.runtime
|
|
152
237
|
|
|
153
|
-
|
|
154
|
-
const { applications } = await runtime.getApplications()
|
|
155
|
-
workerIds = applications.map(app => app.id)
|
|
156
|
-
}
|
|
238
|
+
let { serviceIds, alertId, profileType = 'cpu' } = options
|
|
157
239
|
|
|
158
|
-
|
|
240
|
+
const servicesWorkers = {}
|
|
241
|
+
const workers = await runtime.getWorkers()
|
|
159
242
|
|
|
160
|
-
const
|
|
161
|
-
const
|
|
162
|
-
const
|
|
243
|
+
for (const workerId in workers) {
|
|
244
|
+
const workerInfo = workers[workerId]
|
|
245
|
+
const serviceId = workerInfo.application
|
|
163
246
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
app.log.info(
|
|
168
|
-
{ workerId, alertId }, 'Flamegraph will be attached to the alert'
|
|
169
|
-
)
|
|
170
|
-
profile.waitingAlerts.push(alertId)
|
|
171
|
-
}
|
|
247
|
+
servicesWorkers[serviceId] ??= []
|
|
248
|
+
servicesWorkers[serviceId].push(workerId)
|
|
249
|
+
}
|
|
172
250
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
251
|
+
for (const serviceId in profilers) {
|
|
252
|
+
const workerProfilers = profilers[serviceId]
|
|
253
|
+
for (const profileType in workerProfilers) {
|
|
254
|
+
const profiler = workerProfilers[profileType]
|
|
255
|
+
const workerId = profiler.workerId
|
|
256
|
+
if (workers[workerId]) continue
|
|
257
|
+
if (profiler.isProfiling) {
|
|
258
|
+
profiler.stop()
|
|
176
259
|
}
|
|
260
|
+
delete profilers[serviceId][profileType]
|
|
177
261
|
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
serviceIds ??= Object.keys(servicesWorkers)
|
|
265
|
+
|
|
266
|
+
for (const serviceId of serviceIds) {
|
|
267
|
+
profilers[serviceId] ??= {}
|
|
178
268
|
|
|
179
|
-
|
|
180
|
-
|
|
269
|
+
let profiler = profilers[serviceId][profileType]
|
|
270
|
+
if (!profiler) {
|
|
271
|
+
const workerId = servicesWorkers[serviceId][0]
|
|
272
|
+
const config = profilersConfigs[serviceId]
|
|
273
|
+
profiler = new Profiler({
|
|
274
|
+
app,
|
|
275
|
+
workerId,
|
|
181
276
|
type: profileType,
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
277
|
+
duration: config.durationMillis,
|
|
278
|
+
sourceMaps: config.sourceMaps,
|
|
279
|
+
onProfile: createProfileHandler(scalerUrl, workerId, profileType)
|
|
280
|
+
})
|
|
281
|
+
profilers[serviceId][profileType] = profiler
|
|
282
|
+
}
|
|
188
283
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
284
|
+
profiler.requestProfile({ alertId })
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
function createProfileHandler (scalerUrl, workerId, profileType) {
|
|
289
|
+
const serviceId = workerId.split(':')[0]
|
|
195
290
|
|
|
196
|
-
|
|
197
|
-
|
|
291
|
+
return async (err, profile, requests) => {
|
|
292
|
+
if (err) {
|
|
293
|
+
app.log.error({ err }, 'Failed to generate a profile')
|
|
294
|
+
return
|
|
198
295
|
}
|
|
199
296
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
serviceId,
|
|
205
|
-
profile.data,
|
|
206
|
-
profileType,
|
|
207
|
-
alertId
|
|
208
|
-
)
|
|
209
|
-
profile.flamegraphId = flamegraph.id
|
|
210
|
-
} catch (err) {
|
|
211
|
-
app.log.error({ err, workerId, alertId, profileType }, 'Failed to send flamegraph')
|
|
212
|
-
delete profilesByWorkerId[profileKey]
|
|
213
|
-
return
|
|
297
|
+
const alertIds = []
|
|
298
|
+
for (const request of requests) {
|
|
299
|
+
if (request.alertId) {
|
|
300
|
+
alertIds.push(request.alertId)
|
|
214
301
|
}
|
|
215
302
|
}
|
|
216
303
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
await _attachFlamegraphToAlerts(
|
|
304
|
+
try {
|
|
305
|
+
const alertId = alertIds.shift()
|
|
306
|
+
const flamegraph = await sendServiceFlamegraph(
|
|
221
307
|
scalerUrl,
|
|
222
308
|
serviceId,
|
|
223
|
-
profile.flamegraphId,
|
|
224
309
|
profile.data,
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
)
|
|
228
|
-
}
|
|
229
|
-
})
|
|
230
|
-
|
|
231
|
-
await Promise.all(uploadPromises)
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
async function getServiceFlamegraph (workerId, profileType, attempt = 1) {
|
|
235
|
-
const runtime = app.watt.runtime
|
|
236
|
-
|
|
237
|
-
app.log.info({ workerId, attempt, maxAttempts, attemptTimeout }, 'Getting profile from worker')
|
|
238
|
-
|
|
239
|
-
try {
|
|
240
|
-
const [state, profile] = await Promise.all([
|
|
241
|
-
runtime.sendCommandToApplication(workerId, 'getProfilingState', { type: profileType }),
|
|
242
|
-
runtime.sendCommandToApplication(workerId, 'getLastProfile', { type: profileType })
|
|
243
|
-
])
|
|
244
|
-
return { data: profile, timestamp: state.latestProfileTimestamp }
|
|
245
|
-
} catch (err) {
|
|
246
|
-
if (err.code === 'PLT_PPROF_NO_PROFILE_AVAILABLE') {
|
|
247
|
-
app.log.info(
|
|
248
|
-
{ workerId, attempt, maxAttempts, attemptTimeout },
|
|
249
|
-
'No profile available for the service. Waiting for profiling to complete.'
|
|
310
|
+
profileType,
|
|
311
|
+
alertId
|
|
250
312
|
)
|
|
251
|
-
if (attempt <= maxAttempts) {
|
|
252
|
-
await sleep(attemptTimeout)
|
|
253
|
-
return getServiceFlamegraph(workerId, profileType, attempt + 1)
|
|
254
|
-
}
|
|
255
|
-
} else if (err.code === 'PLT_PPROF_NOT_ENOUGH_ELU') {
|
|
256
|
-
app.log.info({ workerId }, 'ELU low, CPU profiling not active')
|
|
257
|
-
} else {
|
|
258
|
-
app.log.warn({ err, workerId }, 'Failed to get profile from a worker')
|
|
259
313
|
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
314
|
+
if (alertIds.length > 0) {
|
|
315
|
+
await _attachFlamegraphToAlerts(
|
|
316
|
+
scalerUrl,
|
|
317
|
+
serviceId,
|
|
318
|
+
flamegraph.id,
|
|
319
|
+
profile.data,
|
|
320
|
+
profileType,
|
|
321
|
+
alertIds
|
|
322
|
+
)
|
|
264
323
|
}
|
|
324
|
+
} catch (err) {
|
|
325
|
+
app.log.error({ err, workerId }, 'Failed to send flamegraph')
|
|
265
326
|
}
|
|
266
327
|
}
|
|
267
328
|
}
|
|
@@ -269,7 +330,7 @@ async function flamegraphs (app, _opts) {
|
|
|
269
330
|
async function sendServiceFlamegraph (scalerUrl, serviceId, profile, profileType, alertId) {
|
|
270
331
|
const podId = app.instanceId
|
|
271
332
|
const url = `${scalerUrl}/pods/${podId}/services/${serviceId}/flamegraph`
|
|
272
|
-
app.log.info({ serviceId, podId, profileType }, 'Sending flamegraph')
|
|
333
|
+
app.log.info({ serviceId, podId, profileType, alertId }, 'Sending flamegraph')
|
|
273
334
|
|
|
274
335
|
const query = { profileType }
|
|
275
336
|
if (alertId) {
|
|
@@ -293,8 +354,14 @@ async function flamegraphs (app, _opts) {
|
|
|
293
354
|
throw new Error(`Failed to send flamegraph: ${error}`)
|
|
294
355
|
}
|
|
295
356
|
|
|
296
|
-
const
|
|
297
|
-
|
|
357
|
+
const flamegraph = await body.json()
|
|
358
|
+
|
|
359
|
+
app.log.info(
|
|
360
|
+
{ serviceId, podId, profileType, flamegraph },
|
|
361
|
+
'Flamegraph successfully stored'
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
return flamegraph
|
|
298
365
|
}
|
|
299
366
|
|
|
300
367
|
// Function that supports ICC that doesn't have attach flamegraph API
|
|
@@ -367,15 +434,17 @@ async function flamegraphs (app, _opts) {
|
|
|
367
434
|
}
|
|
368
435
|
}
|
|
369
436
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
for (const
|
|
374
|
-
const
|
|
375
|
-
|
|
376
|
-
|
|
437
|
+
app.cleanupFlamegraphs = async () => {
|
|
438
|
+
// Stop all tracked profilers in parallel
|
|
439
|
+
const stopPromises = []
|
|
440
|
+
for (const serviceId in profilers) {
|
|
441
|
+
const serviceProfilers = profilers[serviceId]
|
|
442
|
+
for (const profileType in serviceProfilers) {
|
|
443
|
+
const profiler = serviceProfilers[profileType]
|
|
444
|
+
stopPromises.push(profiler.stop())
|
|
377
445
|
}
|
|
378
446
|
}
|
|
447
|
+
await Promise.all(stopPromises)
|
|
379
448
|
}
|
|
380
449
|
}
|
|
381
450
|
|
|
@@ -183,15 +183,10 @@ async function healthSignals (app, _opts) {
|
|
|
183
183
|
app.log.error({ error }, 'Failed to send health signals to scaler')
|
|
184
184
|
}
|
|
185
185
|
|
|
186
|
-
const
|
|
187
|
-
|
|
188
|
-
app.
|
|
189
|
-
|
|
190
|
-
workerIds: [workerId],
|
|
191
|
-
alertId: alert.id
|
|
192
|
-
}).catch(err => {
|
|
193
|
-
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
194
|
-
})
|
|
186
|
+
const response = await body.json()
|
|
187
|
+
|
|
188
|
+
app.requestFlamegraphs({ serviceIds: [serviceId], alertId: response.alertId })
|
|
189
|
+
.catch(err => app.log.error({ err }, 'Failed to send a flamegraph'))
|
|
195
190
|
}
|
|
196
191
|
}
|
|
197
192
|
|
package/plugins/update.js
CHANGED
|
@@ -23,14 +23,14 @@ async function updatePlugin (app) {
|
|
|
23
23
|
// Handle trigger-flamegraph command from ICC
|
|
24
24
|
if (command === 'trigger-flamegraph') {
|
|
25
25
|
app.log.info({ command }, 'Received trigger-flamegraph command from ICC')
|
|
26
|
-
app.
|
|
26
|
+
app.requestFlamegraphs({ profileType: 'cpu' })
|
|
27
27
|
return
|
|
28
28
|
}
|
|
29
29
|
|
|
30
30
|
// Handle trigger-heapprofile command from ICC
|
|
31
31
|
if (command === 'trigger-heapprofile') {
|
|
32
32
|
app.log.info({ command }, 'Received trigger-heapprofile command from ICC')
|
|
33
|
-
app.
|
|
33
|
+
app.requestFlamegraphs({ profileType: 'heap' })
|
|
34
34
|
return
|
|
35
35
|
}
|
|
36
36
|
|