@platformatic/watt-extra 1.4.0-alpha.4 → 1.5.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app.js +7 -2
- package/index.js +1 -0
- package/lib/watt.js +34 -28
- package/package.json +10 -8
- package/plugins/alerts.js +6 -3
- package/plugins/env.js +4 -1
- package/plugins/flamegraphs.js +35 -1
- package/plugins/health-signals.js +181 -0
- package/plugins/metadata.js +47 -9
- package/test/fixtures/service-1/routes/root.cjs +57 -39
- package/test/health-signals.test.js +130 -0
- package/test/helper.js +3 -0
- package/test/metrics.test.js +37 -4
- package/.claude/settings.local.json +0 -11
package/app.js
CHANGED
|
@@ -8,11 +8,12 @@ import scheduler from './plugins/scheduler.js'
|
|
|
8
8
|
import auth from './plugins/auth.js'
|
|
9
9
|
import update from './plugins/update.js'
|
|
10
10
|
import alert from './plugins/alerts.js'
|
|
11
|
+
import healthSignals from './plugins/health-signals.js'
|
|
11
12
|
import flamegraphs from './plugins/flamegraphs.js'
|
|
12
13
|
|
|
13
14
|
async function buildApp (logger) {
|
|
14
15
|
const app = {
|
|
15
|
-
log: logger
|
|
16
|
+
log: logger
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
avvio(app)
|
|
@@ -22,6 +23,7 @@ async function buildApp (logger) {
|
|
|
22
23
|
.use(auth)
|
|
23
24
|
.use(init)
|
|
24
25
|
.use(alert)
|
|
26
|
+
.use(healthSignals)
|
|
25
27
|
.use(metadata)
|
|
26
28
|
.use(compliancy)
|
|
27
29
|
.use(scheduler)
|
|
@@ -101,7 +103,7 @@ async function buildApp (logger) {
|
|
|
101
103
|
{
|
|
102
104
|
err: err.message,
|
|
103
105
|
attemptNumber: retries,
|
|
104
|
-
nextRetryMs: currentRetryInterval
|
|
106
|
+
nextRetryMs: currentRetryInterval
|
|
105
107
|
},
|
|
106
108
|
`Failed to send info to ICC, retrying in ${currentRetryInterval}ms`
|
|
107
109
|
)
|
|
@@ -112,6 +114,9 @@ async function buildApp (logger) {
|
|
|
112
114
|
|
|
113
115
|
app.close = async function close () {
|
|
114
116
|
app.log.info('Closing runtime')
|
|
117
|
+
if (app.cleanupFlamegraphs) {
|
|
118
|
+
await app.cleanupFlamegraphs()
|
|
119
|
+
}
|
|
115
120
|
if (app.watt.runtime) {
|
|
116
121
|
await app.watt.close()
|
|
117
122
|
}
|
package/index.js
CHANGED
package/lib/watt.js
CHANGED
|
@@ -151,7 +151,17 @@ class Watt {
|
|
|
151
151
|
config.server = {
|
|
152
152
|
...serverConfig,
|
|
153
153
|
hostname: this.#env.PLT_APP_HOSTNAME || serverConfig.hostname,
|
|
154
|
-
port: this.#env.PLT_APP_PORT || serverConfig.port
|
|
154
|
+
port: this.#env.PLT_APP_PORT || serverConfig.port
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
const labels = {
|
|
158
|
+
serviceId: 'main',
|
|
159
|
+
instanceId: this.#instanceId
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
const applicationId = this.#instanceConfig?.applicationId
|
|
163
|
+
if (applicationId) {
|
|
164
|
+
labels.applicationId = applicationId
|
|
155
165
|
}
|
|
156
166
|
|
|
157
167
|
config.hotReload = false
|
|
@@ -159,15 +169,11 @@ class Watt {
|
|
|
159
169
|
config.metrics = {
|
|
160
170
|
server: 'hide',
|
|
161
171
|
defaultMetrics: {
|
|
162
|
-
enabled: true
|
|
172
|
+
enabled: true
|
|
163
173
|
},
|
|
164
174
|
hostname: this.#env.PLT_APP_HOSTNAME || '0.0.0.0',
|
|
165
175
|
port: this.#env.PLT_METRICS_PORT || 9090,
|
|
166
|
-
labels
|
|
167
|
-
serviceId: 'main',
|
|
168
|
-
applicationId: this.#instanceConfig?.applicationId,
|
|
169
|
-
instanceId: this.#instanceId,
|
|
170
|
-
},
|
|
176
|
+
labels,
|
|
171
177
|
applicationLabel: this.#instanceConfig?.applicationMetricsLabel ?? 'serviceId'
|
|
172
178
|
}
|
|
173
179
|
|
|
@@ -231,20 +237,20 @@ class Watt {
|
|
|
231
237
|
),
|
|
232
238
|
options: {
|
|
233
239
|
labels: {
|
|
234
|
-
applicationId: this.#instanceConfig.applicationId
|
|
240
|
+
applicationId: this.#instanceConfig.applicationId
|
|
235
241
|
},
|
|
236
242
|
bloomFilter: {
|
|
237
243
|
size: 100000,
|
|
238
|
-
errorRate: 0.01
|
|
244
|
+
errorRate: 0.01
|
|
239
245
|
},
|
|
240
246
|
maxResponseSize: 5 * 1024 * 1024, // 5MB
|
|
241
247
|
trafficInspectorOptions: {
|
|
242
248
|
url: trafficInspectorOrigin,
|
|
243
249
|
pathSendBody: join(trafficInspectorPath, '/requests'),
|
|
244
|
-
pathSendMeta: join(trafficInspectorPath, '/requests/hash')
|
|
250
|
+
pathSendMeta: join(trafficInspectorPath, '/requests/hash')
|
|
245
251
|
},
|
|
246
|
-
matchingDomains: [this.#env.PLT_APP_INTERNAL_SUB_DOMAIN]
|
|
247
|
-
}
|
|
252
|
+
matchingDomains: [this.#env.PLT_APP_INTERNAL_SUB_DOMAIN]
|
|
253
|
+
}
|
|
248
254
|
}
|
|
249
255
|
}
|
|
250
256
|
|
|
@@ -255,9 +261,9 @@ class Watt {
|
|
|
255
261
|
rules: [
|
|
256
262
|
{
|
|
257
263
|
routeToMatch: 'http://plt.slicer.default/',
|
|
258
|
-
headers: {}
|
|
259
|
-
}
|
|
260
|
-
]
|
|
264
|
+
headers: {}
|
|
265
|
+
}
|
|
266
|
+
]
|
|
261
267
|
}
|
|
262
268
|
|
|
263
269
|
// This is the cache config from ICC
|
|
@@ -309,7 +315,7 @@ class Watt {
|
|
|
309
315
|
|
|
310
316
|
return {
|
|
311
317
|
module: require.resolve('undici-slicer-interceptor'),
|
|
312
|
-
options: cacheConfig
|
|
318
|
+
options: cacheConfig
|
|
313
319
|
}
|
|
314
320
|
}
|
|
315
321
|
|
|
@@ -341,7 +347,7 @@ class Watt {
|
|
|
341
347
|
applicationName: `${this.#applicationName}`,
|
|
342
348
|
skip: [
|
|
343
349
|
{ method: 'GET', path: '/documentation' },
|
|
344
|
-
{ method: 'GET', path: '/documentation/json' }
|
|
350
|
+
{ method: 'GET', path: '/documentation/json' }
|
|
345
351
|
],
|
|
346
352
|
exporter: {
|
|
347
353
|
type: 'otlp',
|
|
@@ -349,14 +355,14 @@ class Watt {
|
|
|
349
355
|
url:
|
|
350
356
|
this.#instanceConfig?.iccServices?.riskEngine?.url + '/v1/traces',
|
|
351
357
|
headers: {
|
|
352
|
-
'x-platformatic-application-id': this.#instanceConfig?.applicationId
|
|
358
|
+
'x-platformatic-application-id': this.#instanceConfig?.applicationId
|
|
353
359
|
},
|
|
354
360
|
keepAlive: true,
|
|
355
361
|
httpAgentOptions: {
|
|
356
|
-
rejectUnauthorized: false
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
}
|
|
362
|
+
rejectUnauthorized: false
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
}
|
|
360
366
|
}
|
|
361
367
|
}
|
|
362
368
|
|
|
@@ -375,7 +381,7 @@ class Watt {
|
|
|
375
381
|
...config.httpCache,
|
|
376
382
|
cacheTagsHeader,
|
|
377
383
|
store: require.resolve('undici-cache-redis'),
|
|
378
|
-
clientOpts: httpCache
|
|
384
|
+
clientOpts: httpCache
|
|
379
385
|
}
|
|
380
386
|
}
|
|
381
387
|
|
|
@@ -384,7 +390,7 @@ class Watt {
|
|
|
384
390
|
...config.health,
|
|
385
391
|
enabled: true,
|
|
386
392
|
interval: 1000,
|
|
387
|
-
maxUnhealthyChecks: 30
|
|
393
|
+
maxUnhealthyChecks: 30
|
|
388
394
|
}
|
|
389
395
|
}
|
|
390
396
|
|
|
@@ -394,7 +400,7 @@ class Watt {
|
|
|
394
400
|
if (config.scheduler) {
|
|
395
401
|
config.scheduler = config.scheduler.map((scheduler) => ({
|
|
396
402
|
...scheduler,
|
|
397
|
-
enabled: false
|
|
403
|
+
enabled: false
|
|
398
404
|
}))
|
|
399
405
|
}
|
|
400
406
|
}
|
|
@@ -413,7 +419,7 @@ class Watt {
|
|
|
413
419
|
[
|
|
414
420
|
'@platformatic/service',
|
|
415
421
|
'@platformatic/composer',
|
|
416
|
-
'@platformatic/db'
|
|
422
|
+
'@platformatic/db'
|
|
417
423
|
].includes(app.type)
|
|
418
424
|
) {
|
|
419
425
|
await this.#configurePlatformaticServices(runtime, app)
|
|
@@ -457,8 +463,8 @@ class Watt {
|
|
|
457
463
|
adapter: 'valkey',
|
|
458
464
|
url: `valkey://${username}:${password}@${host}:${port}`,
|
|
459
465
|
prefix: keyPrefix,
|
|
460
|
-
maxTTL: 604800
|
|
461
|
-
}
|
|
466
|
+
maxTTL: 604800 // 86400 * 7
|
|
467
|
+
}
|
|
462
468
|
})
|
|
463
469
|
}
|
|
464
470
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@platformatic/watt-extra",
|
|
3
|
-
"version": "1.4.0-alpha.4",
|
|
3
|
+
"version": "1.5.0-alpha.0",
|
|
4
4
|
"description": "The Platformatic runtime manager",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
@@ -19,25 +19,27 @@
|
|
|
19
19
|
},
|
|
20
20
|
"devDependencies": {
|
|
21
21
|
"@fastify/websocket": "^11.1.0",
|
|
22
|
-
"@platformatic/composer": "^3.
|
|
23
|
-
"@platformatic/next": "^3.
|
|
24
|
-
"@platformatic/node": "^3.
|
|
25
|
-
"@platformatic/service": "^3.
|
|
22
|
+
"@platformatic/composer": "^3.14.0",
|
|
23
|
+
"@platformatic/next": "^3.14.0",
|
|
24
|
+
"@platformatic/node": "^3.14.0",
|
|
25
|
+
"@platformatic/service": "^3.14.0",
|
|
26
|
+
"atomic-sleep": "^1.0.0",
|
|
26
27
|
"borp": "^0.21.0",
|
|
27
28
|
"eslint": "9",
|
|
28
29
|
"fastify": "^5.4.0",
|
|
29
30
|
"fastify-plugin": "^5.0.1",
|
|
30
31
|
"neostandard": "^0.12.0",
|
|
31
32
|
"next": "^15.3.4",
|
|
32
|
-
"platformatic": "^3.
|
|
33
|
+
"platformatic": "^3.14.0",
|
|
33
34
|
"pprof-format": "^2.1.0",
|
|
34
35
|
"why-is-node-running": "^2.3.0"
|
|
35
36
|
},
|
|
36
37
|
"dependencies": {
|
|
37
38
|
"@datadog/pprof": "^5.9.0",
|
|
38
39
|
"@fastify/error": "^4.2.0",
|
|
39
|
-
"@platformatic/runtime": "^3.
|
|
40
|
-
"@platformatic/
|
|
40
|
+
"@platformatic/runtime": "^3.14.0",
|
|
41
|
+
"@platformatic/foundation": "^3.14.0",
|
|
42
|
+
"@platformatic/wattpm-pprof-capture": "^3.14.0",
|
|
41
43
|
"avvio": "^9.1.0",
|
|
42
44
|
"chalk": "^4.1.2",
|
|
43
45
|
"commist": "^3.2.0",
|
package/plugins/alerts.js
CHANGED
|
@@ -10,6 +10,9 @@ async function alerts (app, _opts) {
|
|
|
10
10
|
const lastServicesAlertTime = {}
|
|
11
11
|
|
|
12
12
|
async function setupAlerts () {
|
|
13
|
+
const scalerAlgorithmVersion = app.env.PLT_SCALER_ALGORITHM_VERSION
|
|
14
|
+
if (scalerAlgorithmVersion !== 'v1') return
|
|
15
|
+
|
|
13
16
|
// Skip alerts setup if ICC is not configured
|
|
14
17
|
if (!app.env.PLT_ICC_URL) {
|
|
15
18
|
app.log.info('PLT_ICC_URL not set, skipping alerts setup')
|
|
@@ -85,13 +88,13 @@ async function alerts (app, _opts) {
|
|
|
85
88
|
method: 'POST',
|
|
86
89
|
headers: {
|
|
87
90
|
'Content-Type': 'application/json',
|
|
88
|
-
...authHeaders
|
|
91
|
+
...authHeaders
|
|
89
92
|
},
|
|
90
93
|
body: JSON.stringify({
|
|
91
94
|
applicationId: app.instanceConfig?.applicationId,
|
|
92
95
|
alert: healthInfo,
|
|
93
|
-
healthHistory: healthCache
|
|
94
|
-
})
|
|
96
|
+
healthHistory: healthCache
|
|
97
|
+
})
|
|
95
98
|
})
|
|
96
99
|
|
|
97
100
|
if (statusCode !== 200) {
|
package/plugins/env.js
CHANGED
|
@@ -22,7 +22,10 @@ const schema = {
|
|
|
22
22
|
PLT_FLAMEGRAPHS_ELU_THRESHOLD: { type: 'number', default: 0.4 },
|
|
23
23
|
PLT_FLAMEGRAPHS_GRACE_PERIOD: { type: 'number', default: 3000 },
|
|
24
24
|
PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
|
|
25
|
-
PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 }
|
|
25
|
+
PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
|
|
26
|
+
PLT_SCALER_ALGORITHM_VERSION: { type: 'string', default: 'v1', enum: ['v1', 'v2'] },
|
|
27
|
+
PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.9 },
|
|
28
|
+
PLT_HEAP_HEALTH_SIGNAL_THRESHOLD: { type: ['number', 'string'], default: '4GB' }
|
|
26
29
|
}
|
|
27
30
|
}
|
|
28
31
|
|
package/plugins/flamegraphs.js
CHANGED
|
@@ -80,11 +80,45 @@ async function flamegraphs (app, _opts) {
|
|
|
80
80
|
runtime.on('application:worker:started', workerStartedListener)
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
-
app.cleanupFlamegraphs = () => {
|
|
83
|
+
app.cleanupFlamegraphs = async () => {
|
|
84
84
|
if (workerStartedListener && app.watt?.runtime) {
|
|
85
85
|
app.watt.runtime.removeListener('application:worker:started', workerStartedListener)
|
|
86
86
|
workerStartedListener = null
|
|
87
87
|
}
|
|
88
|
+
|
|
89
|
+
// Explicitly stop all active profiling sessions to avoid memory corruption
|
|
90
|
+
if (!isFlamegraphsDisabled && app.watt?.runtime) {
|
|
91
|
+
try {
|
|
92
|
+
const workers = await app.watt.runtime.getWorkers()
|
|
93
|
+
const stopPromises = []
|
|
94
|
+
for (const workerFullId of Object.keys(workers)) {
|
|
95
|
+
// Stop both CPU and heap profiling on each worker
|
|
96
|
+
stopPromises.push(
|
|
97
|
+
app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'cpu' })
|
|
98
|
+
.catch(err => {
|
|
99
|
+
// Ignore errors if profiling wasn't running
|
|
100
|
+
if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
|
|
101
|
+
app.log.warn({ err, workerFullId }, 'Failed to stop CPU profiling')
|
|
102
|
+
}
|
|
103
|
+
})
|
|
104
|
+
)
|
|
105
|
+
stopPromises.push(
|
|
106
|
+
app.watt.runtime.sendCommandToApplication(workerFullId, 'stopProfiling', { type: 'heap' })
|
|
107
|
+
.catch(err => {
|
|
108
|
+
// Ignore errors if profiling wasn't running
|
|
109
|
+
if (err.code !== 'PLT_PPROF_PROFILING_NOT_STARTED') {
|
|
110
|
+
app.log.warn({ err, workerFullId }, 'Failed to stop heap profiling')
|
|
111
|
+
}
|
|
112
|
+
})
|
|
113
|
+
)
|
|
114
|
+
}
|
|
115
|
+
await Promise.all(stopPromises)
|
|
116
|
+
// Small delay to ensure native cleanup completes
|
|
117
|
+
await sleep(100)
|
|
118
|
+
} catch (err) {
|
|
119
|
+
app.log.warn({ err }, 'Failed to stop profiling during cleanup')
|
|
120
|
+
}
|
|
121
|
+
}
|
|
88
122
|
}
|
|
89
123
|
|
|
90
124
|
app.sendFlamegraphs = async (options = {}) => {
|
package/plugins/health-signals.js
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { request } from 'undici'
|
|
2
|
+
import { parseMemorySize } from '@platformatic/foundation'
|
|
3
|
+
|
|
4
|
+
class HealthSignalsCache {
|
|
5
|
+
#signals = []
|
|
6
|
+
#size = 100
|
|
7
|
+
|
|
8
|
+
constructor () {
|
|
9
|
+
this.#signals = []
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
add (signals) {
|
|
13
|
+
for (const signal of signals) {
|
|
14
|
+
this.#signals.push(signal)
|
|
15
|
+
}
|
|
16
|
+
if (this.#signals.length > this.#size) {
|
|
17
|
+
this.#signals.splice(0, this.#signals.length - this.#size)
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
getAll () {
|
|
22
|
+
const values = this.#signals
|
|
23
|
+
this.#signals = []
|
|
24
|
+
return values
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
async function healthSignals (app, _opts) {
|
|
29
|
+
const signalsCaches = {}
|
|
30
|
+
const servicesSendingStatuses = {}
|
|
31
|
+
|
|
32
|
+
// TODO: needed to the UI compatibility
|
|
33
|
+
// remove after depricating the Scaler v1 UI
|
|
34
|
+
const servicesMetrics = {}
|
|
35
|
+
|
|
36
|
+
async function setupHealthSignals () {
|
|
37
|
+
const scalerAlgorithmVersion = app.env.PLT_SCALER_ALGORITHM_VERSION
|
|
38
|
+
if (scalerAlgorithmVersion !== 'v2') return
|
|
39
|
+
|
|
40
|
+
const eluThreshold = app.env.PLT_ELU_HEALTH_SIGNAL_THRESHOLD
|
|
41
|
+
|
|
42
|
+
let heapThreshold = app.env.PLT_HEAP_HEALTH_SIGNAL_THRESHOLD
|
|
43
|
+
if (typeof heapThreshold === 'string') {
|
|
44
|
+
heapThreshold = parseMemorySize(heapThreshold)
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Skip alerts setup if ICC is not configured
|
|
48
|
+
if (!app.env.PLT_ICC_URL) {
|
|
49
|
+
app.log.info('PLT_ICC_URL not set, skipping alerts setup')
|
|
50
|
+
return
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
54
|
+
const runtime = app.watt.runtime
|
|
55
|
+
|
|
56
|
+
if (!scalerUrl) {
|
|
57
|
+
app.log.warn(
|
|
58
|
+
'No scaler URL found in ICC services, health alerts disabled'
|
|
59
|
+
)
|
|
60
|
+
return
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
runtime.on('application:worker:health:metrics', async (healthInfo) => {
|
|
64
|
+
if (!healthInfo) {
|
|
65
|
+
app.log.error('No health metrics info received')
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
const {
|
|
69
|
+
application: serviceId,
|
|
70
|
+
currentHealth,
|
|
71
|
+
healthSignals
|
|
72
|
+
} = healthInfo
|
|
73
|
+
|
|
74
|
+
const { elu, heapUsed, heapTotal } = currentHealth
|
|
75
|
+
|
|
76
|
+
if (elu > eluThreshold) {
|
|
77
|
+
healthSignals.push({
|
|
78
|
+
type: 'elu',
|
|
79
|
+
value: currentHealth.elu,
|
|
80
|
+
description:
|
|
81
|
+
`The ${serviceId} has an ELU of ${(elu * 100).toFixed(2)} %, ` +
|
|
82
|
+
`above the maximum allowed usage of ${(eluThreshold * 100).toFixed(2)} %`,
|
|
83
|
+
timestamp: Date.now()
|
|
84
|
+
})
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
if (heapThreshold && heapUsed > heapThreshold) {
|
|
88
|
+
const usedHeapMb = Math.round(heapUsed / 1024 / 1024)
|
|
89
|
+
const heapThresholdMb = Math.round(heapThreshold / 1024 / 1024)
|
|
90
|
+
|
|
91
|
+
healthSignals.push({
|
|
92
|
+
type: 'heapUsed',
|
|
93
|
+
value: currentHealth.heapUsed,
|
|
94
|
+
description:
|
|
95
|
+
`The ${serviceId} is using ${usedHeapMb} MB of heap, ` +
|
|
96
|
+
`above the maximum allowed usage of ${heapThresholdMb} MB`,
|
|
97
|
+
timestamp: Date.now()
|
|
98
|
+
})
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// TODO: needed to the UI compatibility
|
|
102
|
+
// remove after depricating the Scaler v1 UI
|
|
103
|
+
servicesMetrics[serviceId] ??= { elu: 0, heapUsed: 0, heapTotal: 0 }
|
|
104
|
+
const metrics = servicesMetrics[serviceId]
|
|
105
|
+
if (elu > metrics.elu) {
|
|
106
|
+
metrics.elu = elu
|
|
107
|
+
}
|
|
108
|
+
if (heapUsed > metrics.heapUsed) {
|
|
109
|
+
metrics.heapUsed = heapUsed
|
|
110
|
+
metrics.heapTotal = heapTotal
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
if (healthSignals.length > 0) {
|
|
114
|
+
await sendHealthSignalsWithTimeout(serviceId, healthSignals)
|
|
115
|
+
}
|
|
116
|
+
})
|
|
117
|
+
}
|
|
118
|
+
app.setupHealthSignals = setupHealthSignals
|
|
119
|
+
|
|
120
|
+
async function sendHealthSignalsWithTimeout (serviceId, signals) {
|
|
121
|
+
signalsCaches[serviceId] ??= new HealthSignalsCache()
|
|
122
|
+
servicesSendingStatuses[serviceId] ??= false
|
|
123
|
+
|
|
124
|
+
const signalsCache = signalsCaches[serviceId]
|
|
125
|
+
signalsCache.add(signals)
|
|
126
|
+
|
|
127
|
+
if (!servicesSendingStatuses[serviceId]) {
|
|
128
|
+
servicesSendingStatuses[serviceId] = true
|
|
129
|
+
setTimeout(async () => {
|
|
130
|
+
servicesSendingStatuses[serviceId] = false
|
|
131
|
+
|
|
132
|
+
const metrics = servicesMetrics[serviceId]
|
|
133
|
+
servicesMetrics[serviceId] = null
|
|
134
|
+
|
|
135
|
+
try {
|
|
136
|
+
const signals = signalsCache.getAll()
|
|
137
|
+
await sendHealthSignals(serviceId, signals, metrics)
|
|
138
|
+
} catch (err) {
|
|
139
|
+
app.log.error({ err }, 'Failed to send health signals to scaler')
|
|
140
|
+
}
|
|
141
|
+
}, 5000).unref()
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
async function sendHealthSignals (serviceId, signals, metrics) {
|
|
146
|
+
const scalerUrl = app.instanceConfig?.iccServices?.scaler?.url
|
|
147
|
+
const applicationId = app.instanceConfig?.applicationId
|
|
148
|
+
const authHeaders = await app.getAuthorizationHeader()
|
|
149
|
+
|
|
150
|
+
const { statusCode, body } = await request(`${scalerUrl}/signals`, {
|
|
151
|
+
method: 'POST',
|
|
152
|
+
headers: {
|
|
153
|
+
'Content-Type': 'application/json',
|
|
154
|
+
...authHeaders
|
|
155
|
+
},
|
|
156
|
+
body: JSON.stringify({
|
|
157
|
+
applicationId,
|
|
158
|
+
serviceId,
|
|
159
|
+
signals,
|
|
160
|
+
elu: metrics.elu,
|
|
161
|
+
heapUsed: metrics.heapUsed,
|
|
162
|
+
heapTotal: metrics.heapTotal
|
|
163
|
+
})
|
|
164
|
+
})
|
|
165
|
+
|
|
166
|
+
if (statusCode !== 200) {
|
|
167
|
+
const error = await body.text()
|
|
168
|
+
app.log.error({ error }, 'Failed to send health signals to scaler')
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
const alert = await body.json()
|
|
172
|
+
|
|
173
|
+
try {
|
|
174
|
+
await app.sendFlamegraphs({ serviceIds: [serviceId], alertId: alert.id })
|
|
175
|
+
} catch (err) {
|
|
176
|
+
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
export default healthSignals
|
package/plugins/metadata.js
CHANGED
|
@@ -42,15 +42,17 @@ async function metadata (app, _opts) {
|
|
|
42
42
|
)
|
|
43
43
|
)
|
|
44
44
|
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
45
|
+
const workersCount = getWorkersCount(runtimeConfig)
|
|
46
|
+
for (const service of services) {
|
|
47
|
+
const serviceWorkers = workersCount[service.id]
|
|
48
|
+
if (serviceWorkers?.workers) {
|
|
49
|
+
service.workers = serviceWorkers.workers
|
|
50
|
+
}
|
|
51
|
+
if (serviceWorkers?.minWorkers) {
|
|
52
|
+
service.minWorkers = serviceWorkers.minWorkers
|
|
53
|
+
}
|
|
54
|
+
if (serviceWorkers?.maxWorkers) {
|
|
55
|
+
service.maxWorkers = serviceWorkers.maxWorkers
|
|
54
56
|
}
|
|
55
57
|
}
|
|
56
58
|
|
|
@@ -91,6 +93,42 @@ async function metadata (app, _opts) {
|
|
|
91
93
|
}
|
|
92
94
|
}
|
|
93
95
|
app.sendMetadata = sendMetadata
|
|
96
|
+
|
|
97
|
+
function getWorkersCount (runtimeConfig) {
|
|
98
|
+
const verticalScalerConfig = runtimeConfig.verticalScaler
|
|
99
|
+
const serviceWorkers = {}
|
|
100
|
+
|
|
101
|
+
for (const application of runtimeConfig.applications) {
|
|
102
|
+
const { workers } = application
|
|
103
|
+
if (!workers) continue
|
|
104
|
+
|
|
105
|
+
if (typeof workers === 'number') {
|
|
106
|
+
serviceWorkers[application.id] = {
|
|
107
|
+
workers,
|
|
108
|
+
minWorkers: workers,
|
|
109
|
+
maxWorkers: workers
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
if (typeof workers === 'object') {
|
|
113
|
+
serviceWorkers[application.id] = {
|
|
114
|
+
workers: workers.static,
|
|
115
|
+
minWorkers: workers.minimum ?? workers.static,
|
|
116
|
+
maxWorkers: workers.maximum ?? workers.static
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
if (verticalScalerConfig?.enabled) {
|
|
122
|
+
for (const applicationId in verticalScalerConfig.applications) {
|
|
123
|
+
const scalingConfig = verticalScalerConfig.applications[applicationId]
|
|
124
|
+
serviceWorkers[applicationId] ??= {}
|
|
125
|
+
serviceWorkers[applicationId].maxWorkers ??= scalingConfig.maxWorkers
|
|
126
|
+
serviceWorkers[applicationId].minWorkers ??= scalingConfig.minWorkers
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
return serviceWorkers
|
|
131
|
+
}
|
|
94
132
|
}
|
|
95
133
|
|
|
96
134
|
export default metadata
|
package/test/fixtures/service-1/routes/root.cjs
CHANGED
|
@@ -1,48 +1,66 @@
|
|
|
1
|
-
|
|
1
|
+
'use strict'
|
|
2
2
|
|
|
3
|
-
const { join } = require(
|
|
4
|
-
const { readFile } = require(
|
|
5
|
-
const { request } = require(
|
|
3
|
+
const { join } = require('node:path')
|
|
4
|
+
const { readFile } = require('node:fs/promises')
|
|
5
|
+
const { request } = require('undici')
|
|
6
|
+
const atomicSleep = require('atomic-sleep')
|
|
6
7
|
|
|
7
8
|
module.exports = async function (fastify) {
|
|
8
|
-
fastify.get(
|
|
9
|
-
return { hello:
|
|
10
|
-
})
|
|
9
|
+
fastify.get('/example', async () => {
|
|
10
|
+
return { hello: 'world' }
|
|
11
|
+
})
|
|
11
12
|
|
|
12
|
-
fastify.get(
|
|
13
|
-
return fastify.platformatic.config
|
|
14
|
-
})
|
|
13
|
+
fastify.get('/config', async () => {
|
|
14
|
+
return fastify.platformatic.config
|
|
15
|
+
})
|
|
15
16
|
|
|
16
|
-
fastify.get(
|
|
17
|
+
fastify.get('/preprocess', async () => {
|
|
17
18
|
return {
|
|
18
|
-
base:
|
|
19
|
-
leadingSlash:
|
|
20
|
-
withPrefix:
|
|
21
|
-
externalUrl:
|
|
22
|
-
}
|
|
23
|
-
})
|
|
24
|
-
|
|
25
|
-
fastify.get(
|
|
26
|
-
const customExtFilePath = join(__dirname,
|
|
27
|
-
const customExtFile = await readFile(customExtFilePath,
|
|
28
|
-
return { data: customExtFile }
|
|
29
|
-
})
|
|
30
|
-
|
|
31
|
-
fastify.get(
|
|
32
|
-
return { env: process.env }
|
|
33
|
-
})
|
|
34
|
-
|
|
35
|
-
fastify.post(
|
|
36
|
-
const { method, url } = req.body
|
|
19
|
+
base: '~PLT_BASE_PATH',
|
|
20
|
+
leadingSlash: '/~PLT_BASE_PATH',
|
|
21
|
+
withPrefix: '~PLT_BASE_PATH/foo',
|
|
22
|
+
externalUrl: '~PLT_EXTERNAL_APP_URL'
|
|
23
|
+
}
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
fastify.get('/custom-ext-file', async () => {
|
|
27
|
+
const customExtFilePath = join(__dirname, '..', 'file.custom')
|
|
28
|
+
const customExtFile = await readFile(customExtFilePath, 'utf8')
|
|
29
|
+
return { data: customExtFile }
|
|
30
|
+
})
|
|
31
|
+
|
|
32
|
+
fastify.get('/env', async () => {
|
|
33
|
+
return { env: process.env }
|
|
34
|
+
})
|
|
35
|
+
|
|
36
|
+
fastify.post('/request', async (req) => {
|
|
37
|
+
const { method, url } = req.body
|
|
37
38
|
|
|
38
39
|
const { statusCode, headers, body } = await request(url, {
|
|
39
|
-
method: method ??
|
|
40
|
+
method: method ?? 'GET',
|
|
40
41
|
headers: {
|
|
41
|
-
|
|
42
|
-
}
|
|
43
|
-
})
|
|
44
|
-
const data = await body.text()
|
|
45
|
-
|
|
46
|
-
return { statusCode, headers, data }
|
|
47
|
-
})
|
|
48
|
-
|
|
42
|
+
'content-type': 'application/json'
|
|
43
|
+
}
|
|
44
|
+
})
|
|
45
|
+
const data = await body.text()
|
|
46
|
+
|
|
47
|
+
return { statusCode, headers, data }
|
|
48
|
+
})
|
|
49
|
+
|
|
50
|
+
fastify.post('/cpu-intensive', async (req) => {
|
|
51
|
+
// Simulate a CPU intensive operation
|
|
52
|
+
const timeout = req.query.timeout || 10000
|
|
53
|
+
atomicSleep(timeout)
|
|
54
|
+
|
|
55
|
+
return { status: 'ok' }
|
|
56
|
+
})
|
|
57
|
+
|
|
58
|
+
fastify.post('/custom-health-signal', async (req) => {
|
|
59
|
+
const { type, value, description } = req.body
|
|
60
|
+
await globalThis.platformatic.sendHealthSignal({
|
|
61
|
+
type,
|
|
62
|
+
value,
|
|
63
|
+
description
|
|
64
|
+
})
|
|
65
|
+
})
|
|
66
|
+
}
|
package/test/health-signals.test.js
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import assert from 'node:assert'
|
|
2
|
+
import { test } from 'node:test'
|
|
3
|
+
import { randomUUID } from 'node:crypto'
|
|
4
|
+
import { join, dirname } from 'node:path'
|
|
5
|
+
import { fileURLToPath } from 'node:url'
|
|
6
|
+
import { setTimeout as sleep } from 'node:timers/promises'
|
|
7
|
+
import { Profile } from 'pprof-format'
|
|
8
|
+
import { request } from 'undici'
|
|
9
|
+
import { setUpEnvironment, startICC } from './helper.js'
|
|
10
|
+
import { start } from '../index.js'
|
|
11
|
+
|
|
12
|
+
const __filename = fileURLToPath(import.meta.url)
|
|
13
|
+
const __dirname = dirname(__filename)
|
|
14
|
+
|
|
15
|
+
test('should send health signals when service becomes unhealthy', async (t) => {
|
|
16
|
+
const applicationName = 'test-app'
|
|
17
|
+
const applicationId = randomUUID()
|
|
18
|
+
const applicationPath = join(__dirname, 'fixtures', 'service-1')
|
|
19
|
+
|
|
20
|
+
const receivedSignalReqs = []
|
|
21
|
+
const receivedFlamegraphReqs = []
|
|
22
|
+
|
|
23
|
+
const getAuthorizationHeader = async (headers) => {
|
|
24
|
+
return { ...headers, authorization: 'Bearer test-token' }
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
const icc = await startICC(t, {
|
|
28
|
+
applicationId,
|
|
29
|
+
applicationName,
|
|
30
|
+
processSignals: (req) => {
|
|
31
|
+
assert.equal(req.headers.authorization, 'Bearer test-token')
|
|
32
|
+
receivedSignalReqs.push(req.body)
|
|
33
|
+
return { id: 'test-alert-id' }
|
|
34
|
+
},
|
|
35
|
+
processFlamegraphs: (req) => {
|
|
36
|
+
const alertId = req.query.alertId
|
|
37
|
+
assert.strictEqual(alertId, 'test-alert-id')
|
|
38
|
+
assert.strictEqual(req.headers.authorization, 'Bearer test-token')
|
|
39
|
+
receivedFlamegraphReqs.push(req.body)
|
|
40
|
+
}
|
|
41
|
+
})
|
|
42
|
+
|
|
43
|
+
setUpEnvironment({
|
|
44
|
+
PLT_APP_NAME: applicationName,
|
|
45
|
+
PLT_APP_DIR: applicationPath,
|
|
46
|
+
PLT_ICC_URL: 'http://127.0.0.1:3000',
|
|
47
|
+
PLT_DISABLE_FLAMEGRAPHS: false,
|
|
48
|
+
PLT_FLAMEGRAPHS_INTERVAL_SEC: 2,
|
|
49
|
+
PLT_FLAMEGRAPHS_ELU_THRESHOLD: 0,
|
|
50
|
+
PLT_SCALER_ALGORITHM_VERSION: 'v2'
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
const app = await start()
|
|
54
|
+
app.getAuthorizationHeader = getAuthorizationHeader
|
|
55
|
+
|
|
56
|
+
t.after(async () => {
|
|
57
|
+
await app.close()
|
|
58
|
+
await icc.close()
|
|
59
|
+
})
|
|
60
|
+
|
|
61
|
+
// Wait for the first flamegraph to be generated
|
|
62
|
+
await sleep(5000)
|
|
63
|
+
|
|
64
|
+
{
|
|
65
|
+
const { statusCode } = await request('http://127.0.0.1:3042/custom-health-signal', {
|
|
66
|
+
method: 'POST',
|
|
67
|
+
headers: {
|
|
68
|
+
'Content-Type': 'application/json'
|
|
69
|
+
},
|
|
70
|
+
body: JSON.stringify({
|
|
71
|
+
type: 'custom',
|
|
72
|
+
value: 42,
|
|
73
|
+
description: 'This is a custom health signal'
|
|
74
|
+
})
|
|
75
|
+
})
|
|
76
|
+
assert.strictEqual(statusCode, 200)
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
{
|
|
80
|
+
const { statusCode } = await request('http://127.0.0.1:3042/cpu-intensive', {
|
|
81
|
+
method: 'POST',
|
|
82
|
+
headers: {
|
|
83
|
+
'Content-Type': 'application/json'
|
|
84
|
+
},
|
|
85
|
+
body: JSON.stringify({ timeout: 3000 })
|
|
86
|
+
})
|
|
87
|
+
assert.strictEqual(statusCode, 200)
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
assert.strictEqual(receivedSignalReqs.length, 1)
|
|
91
|
+
|
|
92
|
+
const receivedSignalReq = receivedSignalReqs[0]
|
|
93
|
+
assert.ok(receivedSignalReq, 'Alert should have been received')
|
|
94
|
+
assert.strictEqual(receivedSignalReq.applicationId, applicationId)
|
|
95
|
+
assert.strictEqual(receivedSignalReq.serviceId, 'main')
|
|
96
|
+
assert.ok(receivedSignalReq.elu > 0.9)
|
|
97
|
+
assert.ok(receivedSignalReq.heapUsed > 0)
|
|
98
|
+
assert.ok(receivedSignalReq.heapTotal > 0)
|
|
99
|
+
|
|
100
|
+
const receivedSignals = receivedSignalReq.signals
|
|
101
|
+
assert.ok(receivedSignals.length > 5)
|
|
102
|
+
|
|
103
|
+
const eluSignals = receivedSignals.filter(
|
|
104
|
+
(signal) => signal.type === 'elu'
|
|
105
|
+
)
|
|
106
|
+
const customSignals = receivedSignals.filter(
|
|
107
|
+
(signal) => signal.type === 'custom'
|
|
108
|
+
)
|
|
109
|
+
assert.strictEqual(customSignals.length, 1)
|
|
110
|
+
|
|
111
|
+
for (const receivedSignal of eluSignals) {
|
|
112
|
+
assert.strictEqual(receivedSignal.type, 'elu')
|
|
113
|
+
assert.ok(receivedSignal.value > 0.9)
|
|
114
|
+
assert.ok(receivedSignal.timestamp > 0)
|
|
115
|
+
}
|
|
116
|
+
for (const receivedSignal of customSignals) {
|
|
117
|
+
assert.strictEqual(receivedSignal.type, 'custom')
|
|
118
|
+
assert.strictEqual(receivedSignal.value, 42)
|
|
119
|
+
assert.ok(receivedSignal.timestamp > 0)
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Wait for the second flamegraph to be generated
|
|
123
|
+
await sleep(2000)
|
|
124
|
+
|
|
125
|
+
// assert.strictEqual(receivedFlamegraphReqs.length, 1)
|
|
126
|
+
|
|
127
|
+
const receivedFlamegraph = receivedFlamegraphReqs[0]
|
|
128
|
+
const profile = Profile.decode(receivedFlamegraph)
|
|
129
|
+
assert.ok(profile, 'Profile should be decoded')
|
|
130
|
+
})
|
package/test/helper.js
CHANGED
|
@@ -191,6 +191,9 @@ async function startICC (t, opts = {}) {
|
|
|
191
191
|
icc.post('/alerts', async (req) => {
|
|
192
192
|
return opts.processAlerts?.(req)
|
|
193
193
|
})
|
|
194
|
+
icc.post('/signals', async (req) => {
|
|
195
|
+
return opts.processSignals?.(req)
|
|
196
|
+
})
|
|
194
197
|
icc.post('/pods/:podId/services/:serviceId/flamegraph', async (req) => {
|
|
195
198
|
return opts.processFlamegraphs?.(req)
|
|
196
199
|
})
|
package/test/metrics.test.js
CHANGED
|
@@ -28,7 +28,7 @@ test('should generate metrics with a correct labels', async (t) => {
|
|
|
28
28
|
setUpEnvironment({
|
|
29
29
|
PLT_APP_NAME: applicationName,
|
|
30
30
|
PLT_APP_DIR: applicationPath,
|
|
31
|
-
PLT_ICC_URL: 'http://127.0.0.1:3000'
|
|
31
|
+
PLT_ICC_URL: 'http://127.0.0.1:3000'
|
|
32
32
|
})
|
|
33
33
|
|
|
34
34
|
const app = await start()
|
|
@@ -40,7 +40,7 @@ test('should generate metrics with a correct labels', async (t) => {
|
|
|
40
40
|
|
|
41
41
|
const { statusCode, body } = await request('http://127.0.0.1:9090/metrics', {
|
|
42
42
|
headers: {
|
|
43
|
-
accept: 'application/json'
|
|
43
|
+
accept: 'application/json'
|
|
44
44
|
}
|
|
45
45
|
})
|
|
46
46
|
assert.strictEqual(statusCode, 200)
|
|
@@ -77,7 +77,7 @@ test('should generate metrics with a custom metrics label', async (t) => {
|
|
|
77
77
|
setUpEnvironment({
|
|
78
78
|
PLT_APP_NAME: applicationName,
|
|
79
79
|
PLT_APP_DIR: applicationPath,
|
|
80
|
-
PLT_ICC_URL: 'http://127.0.0.1:3000'
|
|
80
|
+
PLT_ICC_URL: 'http://127.0.0.1:3000'
|
|
81
81
|
})
|
|
82
82
|
|
|
83
83
|
const app = await start()
|
|
@@ -89,7 +89,7 @@ test('should generate metrics with a custom metrics label', async (t) => {
|
|
|
89
89
|
|
|
90
90
|
const { statusCode, body } = await request('http://127.0.0.1:9090/metrics', {
|
|
91
91
|
headers: {
|
|
92
|
-
accept: 'application/json'
|
|
92
|
+
accept: 'application/json'
|
|
93
93
|
}
|
|
94
94
|
})
|
|
95
95
|
assert.strictEqual(statusCode, 200)
|
|
@@ -105,3 +105,36 @@ test('should generate metrics with a custom metrics label', async (t) => {
|
|
|
105
105
|
assert.strictEqual(labels[applicationMetricsLabel], 'main')
|
|
106
106
|
}
|
|
107
107
|
})
|
|
108
|
+
|
|
109
|
+
test('should not set an applicationId label if it is undefined', async (t) => {
|
|
110
|
+
const applicationName = 'test-app'
|
|
111
|
+
const applicationPath = join(__dirname, 'fixtures', 'service-1')
|
|
112
|
+
|
|
113
|
+
delete process.env.PLT_ICC_URL
|
|
114
|
+
|
|
115
|
+
process.env.PLT_TEST_APP_1_URL = 'http://test-app-1:3042'
|
|
116
|
+
t.after(() => {
|
|
117
|
+
delete process.env.PLT_TEST_APP_1_URL
|
|
118
|
+
})
|
|
119
|
+
|
|
120
|
+
setUpEnvironment({
|
|
121
|
+
PLT_APP_NAME: applicationName,
|
|
122
|
+
PLT_APP_DIR: applicationPath
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
const app = await start()
|
|
126
|
+
|
|
127
|
+
t.after(async () => {
|
|
128
|
+
await app.close()
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
const { statusCode, body } = await request('http://127.0.0.1:9090/metrics')
|
|
132
|
+
assert.strictEqual(statusCode, 200)
|
|
133
|
+
|
|
134
|
+
const metrics = await body.text()
|
|
135
|
+
const lines = metrics.split('\n')
|
|
136
|
+
|
|
137
|
+
for (const line of lines) {
|
|
138
|
+
assert.ok(!line.includes('applicationId'), 'applicationId label should not be set:' + line)
|
|
139
|
+
}
|
|
140
|
+
})
|