@platformatic/watt-extra 1.7.1-alpha.7 → 1.8.1-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +13 -4
- package/.github/workflows/test.yml +1 -1
- package/lib/watt.js +95 -21
- package/package.json +1 -1
- package/plugins/alerts.js +29 -15
- package/plugins/env.js +3 -3
- package/plugins/flamegraphs.js +206 -416
- package/plugins/health-signals.js +19 -11
- package/plugins/init.js +10 -0
- package/plugins/update.js +2 -2
- package/test/alerts.test.js +25 -126
- package/test/fixtures/runtime-telemetry/main/package.json +3 -0
- package/test/fixtures/runtime-telemetry/main/platformatic.json +7 -0
- package/test/fixtures/runtime-telemetry/package.json +1 -0
- package/test/fixtures/runtime-telemetry/platformatic.json +26 -0
- package/test/fixtures/service-1/routes/root.cjs +1 -13
- package/test/health-signals.test.js +10 -166
- package/test/helper.js +32 -24
- package/test/init.test.js +273 -0
- package/test/patch-config.test.js +93 -14
- package/test/shared-context.test.js +1 -0
- package/test/trigger-flamegraphs.test.js +416 -257
- package/test/profiler.test.js +0 -443
|
@@ -1,11 +1,20 @@
|
|
|
1
1
|
{
|
|
2
2
|
"permissions": {
|
|
3
3
|
"allow": [
|
|
4
|
-
"
|
|
4
|
+
"Read(//work/workspaces/workspace-platformatic/platformatic/**)",
|
|
5
|
+
"Bash(npx borp:*)",
|
|
6
|
+
"Bash(timeout 30 npx borp -c 1 --timeout=20000 ./test/trigger-flamegraphs.test.js)",
|
|
7
|
+
"Bash(xargs cat:*)",
|
|
8
|
+
"Bash(pnpm install)",
|
|
9
|
+
"Bash(find:*)",
|
|
10
|
+
"Bash(cat:*)",
|
|
11
|
+
"WebFetch(domain:github.com)",
|
|
5
12
|
"Bash(node --test:*)",
|
|
6
|
-
"Bash(for i in
|
|
7
|
-
"Bash(do echo \"
|
|
8
|
-
"Bash(done)"
|
|
13
|
+
"Bash(for i in 1 2 3)",
|
|
14
|
+
"Bash(do echo \"Run $i:\")",
|
|
15
|
+
"Bash(done)",
|
|
16
|
+
"Bash(git stash:*)",
|
|
17
|
+
"Bash(echo:*)"
|
|
9
18
|
],
|
|
10
19
|
"deny": [],
|
|
11
20
|
"ask": []
|
package/lib/watt.js
CHANGED
|
@@ -94,6 +94,49 @@ class Watt {
|
|
|
94
94
|
}
|
|
95
95
|
}
|
|
96
96
|
|
|
97
|
+
async updateInstanceConfig (instanceConfig) {
|
|
98
|
+
this.#logger.info({ applicationId: instanceConfig?.applicationId }, 'Updating instance config after ICC recovery')
|
|
99
|
+
|
|
100
|
+
const previousConfig = this.#instanceConfig
|
|
101
|
+
this.#instanceConfig = instanceConfig
|
|
102
|
+
|
|
103
|
+
// If we didn't have a config before and now we do, apply runtime updates
|
|
104
|
+
if (!previousConfig && instanceConfig && this.runtime) {
|
|
105
|
+
// Update undici interceptors
|
|
106
|
+
try {
|
|
107
|
+
const undiciConfig = this.#getUndiciConfig()
|
|
108
|
+
await this.runtime.updateUndiciInterceptors?.(undiciConfig)
|
|
109
|
+
this.#logger.info('Updated undici interceptors after ICC recovery')
|
|
110
|
+
} catch (err) {
|
|
111
|
+
this.#logger.error({ err }, 'Failed to update undici interceptors after ICC recovery')
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Update metrics config if runtime supports it
|
|
115
|
+
if (typeof this.runtime.updateMetricsConfig === 'function') {
|
|
116
|
+
try {
|
|
117
|
+
// Get current metrics config set by #configureRuntime
|
|
118
|
+
const runtimeConfig = this.runtime.getRuntimeConfig(true)
|
|
119
|
+
const currentMetrics = runtimeConfig.metrics || {}
|
|
120
|
+
|
|
121
|
+
// Merge with ICC updates
|
|
122
|
+
const updatedMetrics = {
|
|
123
|
+
...currentMetrics,
|
|
124
|
+
labels: {
|
|
125
|
+
...currentMetrics.labels,
|
|
126
|
+
applicationId: instanceConfig.applicationId
|
|
127
|
+
},
|
|
128
|
+
applicationLabel: instanceConfig.applicationMetricsLabel ?? currentMetrics.applicationLabel
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
await this.runtime.updateMetricsConfig(updatedMetrics)
|
|
132
|
+
this.#logger.info('Updated metrics config after ICC recovery')
|
|
133
|
+
} catch (err) {
|
|
134
|
+
this.#logger.error({ err }, 'Failed to update metrics config after ICC recovery')
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
97
140
|
async updateSharedContext (context) {
|
|
98
141
|
this.#sharedContext = context
|
|
99
142
|
await this.runtime?.updateSharedContext?.({ context })
|
|
@@ -351,30 +394,61 @@ class Watt {
|
|
|
351
394
|
!!this.#instanceConfig?.enableOpenTelemetry &&
|
|
352
395
|
!!this.#instanceConfig?.iccServices?.riskEngine?.url
|
|
353
396
|
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
type: 'otlp',
|
|
365
|
-
options: {
|
|
366
|
-
url:
|
|
367
|
-
this.#instanceConfig?.iccServices?.riskEngine?.url + '/v1/traces',
|
|
368
|
-
headers: {
|
|
369
|
-
'x-platformatic-application-id': this.#instanceConfig?.applicationId
|
|
370
|
-
},
|
|
371
|
-
keepAlive: true,
|
|
372
|
-
httpAgentOptions: {
|
|
373
|
-
rejectUnauthorized: false
|
|
374
|
-
}
|
|
397
|
+
const iccExporter = {
|
|
398
|
+
type: 'otlp',
|
|
399
|
+
options: {
|
|
400
|
+
url: this.#instanceConfig?.iccServices?.riskEngine?.url + '/v1/traces',
|
|
401
|
+
headers: {
|
|
402
|
+
'x-platformatic-application-id': this.#instanceConfig?.applicationId
|
|
403
|
+
},
|
|
404
|
+
keepAlive: true,
|
|
405
|
+
httpAgentOptions: {
|
|
406
|
+
rejectUnauthorized: false
|
|
375
407
|
}
|
|
376
408
|
}
|
|
377
409
|
}
|
|
410
|
+
|
|
411
|
+
const defaultSkip = [
|
|
412
|
+
{ method: 'GET', path: '/documentation' },
|
|
413
|
+
{ method: 'GET', path: '/documentation/json' }
|
|
414
|
+
]
|
|
415
|
+
|
|
416
|
+
// If user has no telemetry config, create default
|
|
417
|
+
if (!config.telemetry) {
|
|
418
|
+
config.telemetry = {
|
|
419
|
+
enabled: enableOpenTelemetry,
|
|
420
|
+
applicationName: `${this.#applicationName}`,
|
|
421
|
+
skip: defaultSkip,
|
|
422
|
+
exporter: iccExporter
|
|
423
|
+
}
|
|
424
|
+
return
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
// Merge with existing telemetry config
|
|
428
|
+
// Always set applicationName for taxonomy diagrams (overrides user's value)
|
|
429
|
+
config.telemetry.applicationName = `${this.#applicationName}`
|
|
430
|
+
|
|
431
|
+
// If ICC telemetry is enabled, add ICC exporter to user's exporters
|
|
432
|
+
if (enableOpenTelemetry) {
|
|
433
|
+
const userExporter = config.telemetry.exporter
|
|
434
|
+
if (!userExporter) {
|
|
435
|
+
// No user exporter, just use ICC
|
|
436
|
+
config.telemetry.exporter = iccExporter
|
|
437
|
+
} else if (Array.isArray(userExporter)) {
|
|
438
|
+
// User has array of exporters, add ICC to the list
|
|
439
|
+
config.telemetry.exporter = [...userExporter, iccExporter]
|
|
440
|
+
} else {
|
|
441
|
+
// User has single exporter, convert to array with both
|
|
442
|
+
config.telemetry.exporter = [userExporter, iccExporter]
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// Merge skip patterns
|
|
447
|
+
if (config.telemetry.skip) {
|
|
448
|
+
config.telemetry.skip = [...config.telemetry.skip, ...defaultSkip]
|
|
449
|
+
} else {
|
|
450
|
+
config.telemetry.skip = defaultSkip
|
|
451
|
+
}
|
|
378
452
|
}
|
|
379
453
|
|
|
380
454
|
#configureHttpCaching (config) {
|
package/package.json
CHANGED
package/plugins/alerts.js
CHANGED
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
import { request } from 'undici'
|
|
2
2
|
|
|
3
3
|
async function alerts (app, _opts) {
|
|
4
|
-
const pauseEluThreshold = app.env.PLT_FLAMEGRAPHS_PAUSE_ELU_TRESHOLD
|
|
5
|
-
const pauseTimeout = app.env.PLT_FLAMEGRAPHS_PAUSE_TIMEOUT
|
|
6
|
-
|
|
7
4
|
const healthCache = [] // It's OK to have this in memory, this is per-pod.
|
|
8
5
|
const podHealthWindow =
|
|
9
6
|
app.instanceConfig?.scaler?.podHealthWindow || 60 * 1000
|
|
@@ -13,6 +10,10 @@ async function alerts (app, _opts) {
|
|
|
13
10
|
const lastServicesAlertTime = {}
|
|
14
11
|
const workerStartTimes = new Map() // Track per-worker start times for grace period
|
|
15
12
|
|
|
13
|
+
// Store listener references for cleanup
|
|
14
|
+
let workerStartedListener = null
|
|
15
|
+
let healthListener = null
|
|
16
|
+
|
|
16
17
|
async function setupAlerts () {
|
|
17
18
|
const scalerAlgorithmVersion = app.instanceConfig?.scaler?.version ?? 'v1'
|
|
18
19
|
if (scalerAlgorithmVersion !== 'v1') {
|
|
@@ -40,17 +41,30 @@ async function alerts (app, _opts) {
|
|
|
40
41
|
return
|
|
41
42
|
}
|
|
42
43
|
|
|
44
|
+
const healthEventName = app.watt.runtimeSupportsNewHealthMetrics()
|
|
45
|
+
? 'application:worker:health:metrics'
|
|
46
|
+
: 'application:worker:health'
|
|
47
|
+
|
|
48
|
+
// Remove old listeners if they exist (for ICC recovery scenario)
|
|
49
|
+
if (workerStartedListener) {
|
|
50
|
+
runtime.removeListener('application:worker:started', workerStartedListener)
|
|
51
|
+
}
|
|
52
|
+
if (healthListener) {
|
|
53
|
+
runtime.removeListener(healthEventName, healthListener)
|
|
54
|
+
}
|
|
55
|
+
|
|
43
56
|
// Default start time for workers that started before the listener was registered
|
|
44
57
|
const pluginStartTime = Date.now()
|
|
45
58
|
|
|
46
59
|
// Listen for worker start events to track start times
|
|
47
|
-
|
|
60
|
+
workerStartedListener = (workerInfo) => {
|
|
48
61
|
const workerId = workerInfo?.id
|
|
49
62
|
if (workerId) {
|
|
50
63
|
workerStartTimes.set(workerId, Date.now())
|
|
51
64
|
app.log.debug({ workerId }, 'Worker started, tracking for grace period')
|
|
52
65
|
}
|
|
53
|
-
}
|
|
66
|
+
}
|
|
67
|
+
runtime.on('application:worker:started', workerStartedListener)
|
|
54
68
|
|
|
55
69
|
const processHealthInfo = async (healthInfo) => {
|
|
56
70
|
if (!healthInfo) {
|
|
@@ -64,11 +78,6 @@ async function alerts (app, _opts) {
|
|
|
64
78
|
const healthWithTimestamp = { ...healthInfo, timestamp, service: serviceId }
|
|
65
79
|
delete healthWithTimestamp.healthConfig // we don't need to store this
|
|
66
80
|
|
|
67
|
-
const elu = healthInfo.currentHealth.elu
|
|
68
|
-
if (elu >= pauseEluThreshold) {
|
|
69
|
-
app.pauseProfiling({ serviceId, timeout: pauseTimeout })
|
|
70
|
-
}
|
|
71
|
-
|
|
72
81
|
healthCache.push(healthWithTimestamp)
|
|
73
82
|
|
|
74
83
|
const cutoffTime = timestamp - podHealthWindow
|
|
@@ -144,14 +153,18 @@ async function alerts (app, _opts) {
|
|
|
144
153
|
|
|
145
154
|
const alert = await body.json()
|
|
146
155
|
|
|
147
|
-
app.
|
|
148
|
-
|
|
156
|
+
app.sendFlamegraphs({
|
|
157
|
+
workerIds: [workerId],
|
|
158
|
+
alertId: alert.id
|
|
159
|
+
}).catch(err => {
|
|
160
|
+
app.log.error({ err }, 'Failed to send a flamegraph')
|
|
161
|
+
})
|
|
149
162
|
}
|
|
150
163
|
}
|
|
151
164
|
|
|
152
165
|
if (app.watt.runtimeSupportsNewHealthMetrics()) {
|
|
153
166
|
// Runtime >= 3.18.0: Listen to health:metrics
|
|
154
|
-
|
|
167
|
+
healthListener = async (health) => {
|
|
155
168
|
if (!health) {
|
|
156
169
|
app.log.error('No health info received')
|
|
157
170
|
return
|
|
@@ -181,11 +194,12 @@ async function alerts (app, _opts) {
|
|
|
181
194
|
}
|
|
182
195
|
|
|
183
196
|
await processHealthInfo(healthInfo)
|
|
184
|
-
}
|
|
197
|
+
}
|
|
185
198
|
} else {
|
|
186
199
|
// Runtime < 3.18.0:
|
|
187
|
-
|
|
200
|
+
healthListener = processHealthInfo
|
|
188
201
|
}
|
|
202
|
+
runtime.on(healthEventName, healthListener)
|
|
189
203
|
}
|
|
190
204
|
app.setupAlerts = setupAlerts
|
|
191
205
|
}
|
package/plugins/env.js
CHANGED
|
@@ -19,10 +19,10 @@ const schema = {
|
|
|
19
19
|
PLT_CACHE_CONFIG: { type: 'string' },
|
|
20
20
|
PLT_DISABLE_FLAMEGRAPHS: { type: 'boolean', default: false },
|
|
21
21
|
PLT_FLAMEGRAPHS_INTERVAL_SEC: { type: 'number', default: 60 },
|
|
22
|
+
PLT_FLAMEGRAPHS_ELU_THRESHOLD: { type: 'number', default: 0.4 },
|
|
22
23
|
PLT_FLAMEGRAPHS_GRACE_PERIOD: { type: 'number', default: 3000 },
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
PLT_FLAMEGRAPHS_STATES_REFRESH_INTERVAL: { type: 'number', default: 10 * 1000 },
|
|
24
|
+
PLT_FLAMEGRAPHS_ATTEMPT_TIMEOUT: { type: 'number', default: 10000 },
|
|
25
|
+
PLT_FLAMEGRAPHS_CACHE_CLEANUP_INTERVAL: { type: 'number', default: 120000 },
|
|
26
26
|
PLT_JWT_EXPIRATION_OFFSET_SEC: { type: 'number', default: 60 },
|
|
27
27
|
PLT_UPDATES_RECONNECT_INTERVAL_SEC: { type: 'number', default: 1 },
|
|
28
28
|
PLT_ELU_HEALTH_SIGNAL_THRESHOLD: { type: 'number', default: 0.8 },
|