pm2-perfmonitor 2.4.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/app.js CHANGED
@@ -1,499 +1,516 @@
1
- const pmx = require('pmx')
2
- const pm2 = require('pm2')
3
- const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
4
- const {
5
- parseParamToArray,
6
- parseParamToNumber,
7
- parseBool,
8
- sleepAsync,
9
- getSysCpuUsageByPid,
10
- } = require('./utils')
11
- const { defaultOptions } = require('./defaults')
12
- const { sendMessage } = require('./message')
13
- const { performPerfSampling } = require('./perf-sampler')
14
- const { sendAlert } = require('./alert')
15
- const { getJobConfHostName } = require('./job-conf')
16
-
17
- const conf = pmx.initModule({}, (err, incomingConf) => {
18
- if (err) {
19
- console.error(`[${incomingConf.module_name}] init module error:`, err)
20
- process.exit(2)
21
- }
22
-
23
- return {
24
- ...defaultOptions,
25
- ...incomingConf,
26
- }
27
- })
28
-
29
- const Probe = pmx.probe()
30
- const MODULE_NAME = conf.module_name
31
- const MODULE_ENABLED = parseBool(conf.enabled)
32
- const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
33
- const INCLUDE_APPS = parseParamToArray(conf.includeApps)
34
- const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)
35
- const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
36
- const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
37
- conf.autoRestartWhenZombieDetected,
38
- )
39
- const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
40
- const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)
41
-
42
- const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
43
- const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
44
- const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
45
- const enablePerfCollection = parseBool(conf.enablePerfCollection)
46
- const perfReportGenerationDir = conf.perfReportGenerationDir
47
- const flamegraphDir = conf.flamegraphDir
48
- const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
49
- const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
50
- const enableNodeInspectorCollection = parseBool(
51
- conf.enableNodeInspectorCollection,
52
- )
53
- const nodeInspectorSampleDuration = parseParamToNumber(
54
- conf.nodeInspectorSampleDuration,
55
- )
56
- const enableAlert = parseBool(conf.enableAlert)
57
- const alertCmdPath = conf.alertCmdPath
58
- const alertEnv = conf.alertEnv
59
- const alertLevel = conf.alertLevel
60
- const jobHostNameConfPath = conf.jobHostNameConfPath
61
-
62
- // 存储每个进程的 CPU 采样历史(pm_id -> [cpu1, cpu2, ...])
63
- const zombieCpuHistory = new Map()
64
- const zombieRestartHistory = new Map()
65
- const zombieRestartFailedHistory = new Map()
66
-
67
- const cpuOverloadHistory = new Map()
68
- const cpuOverloadRestartHistory = new Map()
69
- const cpuOverloadRestartFailedHistory = new Map()
70
-
71
- let isProcessCheckerRunning = false
72
-
73
- /**
74
- * perf 样本是否采集中
75
- * @type { Map<number,boolean> }
76
- */
77
- const perfSamplingStats = new Map()
78
-
79
- /**
80
- * @param {'log' | 'info' | 'error' | 'warn'} type
81
- *
82
- */
83
- const logger = (type, ...args) => {
84
- return console[type](`[${MODULE_NAME}]`, ...args)
85
- }
86
-
87
- /**
88
- * 判断是否为僵尸进程:最近 ZOMBIE_MAX_HITS 次全是 0%
89
- * @param { number[] } history
90
- */
91
- const isZombie = (history) => {
92
- return history.length >= ZOMBIE_MAX_HITS && history.every((v) => v === 0)
93
- }
94
-
95
- /**
96
- * @param { number[] } history
97
- */
98
- const isCpuOverload = (history) => {
99
- return (
100
- history.length >= cpuOverloadMaxHits &&
101
- history.every((v) => v >= cpuOverloadThreshold)
102
- )
103
- }
104
-
105
- /**
106
- * @param { number } pm_id
107
- * @param { number } appCpuUsage
108
- * @returns { number[] } 对应 pm_id 的 CPU 使用率数组
109
- */
110
- const setZombieCpuHistory = (pm_id, appCpuUsage) => {
111
- if (!zombieCpuHistory.has(pm_id)) {
112
- zombieCpuHistory.set(pm_id, [])
113
- }
114
-
115
- const history = zombieCpuHistory.get(pm_id)
116
-
117
- history.push(appCpuUsage)
118
-
119
- // 只保留最近 ZOMBIE_MAX_HITS 次记录
120
- if (history.length > ZOMBIE_MAX_HITS) {
121
- history.shift()
122
- }
123
-
124
- return history
125
- }
126
-
127
- /**
128
- * @param { number } pm_id
129
- * @param { number } appCpuUsage
130
- * @returns { number[] } 对应 pm_id 的 CPU 使用率数组
131
- */
132
- const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
133
- if (!cpuOverloadHistory.has(pm_id)) {
134
- cpuOverloadHistory.set(pm_id, [])
135
- }
136
-
137
- const history = cpuOverloadHistory.get(pm_id)
138
-
139
- history.push(appCpuUsage)
140
-
141
- // 只保留最近 x 次记录
142
- if (history.length > cpuOverloadMaxHits) {
143
- history.shift()
144
- }
145
-
146
- return history
147
- }
148
-
149
- const setZombieRestartFailedHistory = (pm_id) => {
150
- if (!zombieRestartFailedHistory.has(pm_id)) {
151
- zombieRestartFailedHistory.set(pm_id, 1)
152
- } else {
153
- zombieRestartFailedHistory.set(
154
- pm_id,
155
- zombieRestartFailedHistory.get(pm_id) + 1,
156
- )
157
- }
158
- }
159
-
160
- const setCpuOverloadRestartFailedHistory = (pm_id) => {
161
- if (!cpuOverloadRestartFailedHistory.has(pm_id)) {
162
- cpuOverloadRestartFailedHistory.set(pm_id, 1)
163
- } else {
164
- cpuOverloadRestartFailedHistory.set(
165
- pm_id,
166
- cpuOverloadRestartFailedHistory.get(pm_id) + 1,
167
- )
168
- }
169
- }
170
-
171
- /**
172
- * 发送重启警告
173
- * @param {string} title
174
- * @param {string} message
175
- */
176
- const sendRestartAlert = async (title, message) => {
177
- if (!enableAlert) return
178
-
179
- const datetime = new Date().toLocaleString()
180
- const jobHostName = getJobConfHostName(jobHostNameConfPath)
181
-
182
- return await sendAlert({
183
- cmd: alertCmdPath,
184
- env: alertEnv,
185
- level: alertLevel,
186
- title: `[${MODULE_NAME}] [${datetime}] Alert: ${title}`,
187
- content: `[${jobHostName}] - ${message}`,
188
- })
189
- }
190
-
191
- /**
192
- * check process
193
- */
194
- const processChecker = async () => {
195
- if (isProcessCheckerRunning) return
196
-
197
- try {
198
- isProcessCheckerRunning = true
199
-
200
- const apps = await listAppsAsync()
201
-
202
- for (const app of apps) {
203
- const { name, pid, pm_id, monit, pm2_env } = app
204
-
205
- const appStatus = pm2_env?.status
206
-
207
- // 非目标应用,跳过
208
- if (
209
- MODULE_NAME === name ||
210
- (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
211
- (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
212
- ) {
213
- continue
214
- }
215
-
216
- // 只处理 online 状态的进程
217
- if (appStatus !== 'online') {
218
- // 进程不在 online 状态时,清空其历史记录,避免干扰
219
- zombieCpuHistory.delete(pm_id)
220
- cpuOverloadHistory.delete(pm_id)
221
-
222
- continue
223
- }
224
-
225
- const pm2CpuUsage = monit?.cpu
226
- const sysCpuUsage = await getSysCpuUsageByPid(pid)
227
- const appCpuUsage =
228
- typeof sysCpuUsage === 'number' ? sysCpuUsage : pm2CpuUsage
229
-
230
- const history = setZombieCpuHistory(pm_id, appCpuUsage)
231
- const history2 = setCpuOverloadHistory(pm_id, appCpuUsage)
232
-
233
- // 发送消息通知对应应用进程,采样 CPU 性能
234
- if (enableNodeInspectorCollection) {
235
- if (appCpuUsage >= cpuOverloadThreshold) {
236
- await sendMessage(pm_id, 'cpu-profile-start')
237
- await sleepAsync(nodeInspectorSampleDuration * 1000)
238
- await sendMessage(pm_id, 'cpu-profile-stop')
239
- }
240
- }
241
-
242
- // 判断是否为僵尸:最近 ZOMBIE_MAX_HITS 次全是 0%
243
- if (ZOMBIE_DETECTION && isZombie(history)) {
244
- logger(
245
- 'info',
246
- `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
247
- )
248
-
249
- if (AUTO_RESTART_WHEN_ZOMBIE_DETECTED) {
250
- if (
251
- ZOMBIE_MAX_RESTARTS > 0 &&
252
- zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
253
- ) {
254
- continue
255
- }
256
-
257
- logger('info', 'restarting...')
258
-
259
- try {
260
- await restartAppAsync(pm_id)
261
-
262
- if (!zombieRestartHistory.has(pm_id)) {
263
- zombieRestartHistory.set(pm_id, 1)
264
- } else {
265
- const history = zombieRestartHistory.get(pm_id)
266
-
267
- zombieRestartHistory.set(pm_id, history + 1)
268
- }
269
-
270
- logger(
271
- 'info',
272
- `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${zombieRestartHistory.get(pm_id)} times`,
273
- )
274
-
275
- await sendRestartAlert(
276
- `The zombie process has been restarted!`,
277
- `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${zombieRestartHistory.get(pm_id)} times`,
278
- )
279
-
280
- // 重启后清除该进程的历史记录,避免刚重启又被判定为僵尸
281
- zombieCpuHistory.delete(pm_id)
282
- } catch (restartErr) {
283
- logger(
284
- 'error',
285
- `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
286
- restartErr,
287
- )
288
-
289
- setZombieRestartFailedHistory(pm_id)
290
- }
291
- }
292
- }
293
- // CPU 是否持续过载
294
- else if (cpuOverloadDetection && isCpuOverload(history2)) {
295
- logger(
296
- 'info',
297
- `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
298
- )
299
-
300
- if (enablePerfCollection) {
301
- perfSamplingStats.set(pm_id, true)
302
-
303
- await performPerfSampling({
304
- pid,
305
- moduleName: MODULE_NAME,
306
- perfDir: perfReportGenerationDir,
307
- flamegraphDir,
308
- sampleDuration: perfSampleDuration,
309
- sampleFrequency: perfSampleFrequency,
310
- })
311
-
312
- perfSamplingStats.delete(pm_id)
313
- }
314
-
315
- try {
316
- logger('info', 'restarting...')
317
-
318
- await restartAppAsync(pm_id)
319
-
320
- if (!cpuOverloadRestartHistory.has(pm_id)) {
321
- cpuOverloadRestartHistory.set(pm_id, 1)
322
- } else {
323
- cpuOverloadRestartHistory.set(
324
- pm_id,
325
- cpuOverloadRestartHistory.get(pm_id) + 1,
326
- )
327
- }
328
-
329
- logger(
330
- 'info',
331
- `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${cpuOverloadRestartHistory.get(pm_id)} times`,
332
- )
333
-
334
- await sendRestartAlert(
335
- `CPU overload process restarted!`,
336
- `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${cpuOverloadRestartHistory.get(pm_id)} times`,
337
- )
338
-
339
- cpuOverloadHistory.delete(pm_id)
340
- } catch (restartErr) {
341
- logger(
342
- 'error',
343
- `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
344
- restartErr,
345
- )
346
-
347
- setCpuOverloadRestartFailedHistory(pm_id)
348
- }
349
- }
350
- }
351
- } catch (err) {
352
- logger('error', err)
353
- } finally {
354
- isProcessCheckerRunning = false
355
- }
356
- }
357
-
358
- const runModule = () => {
359
- if (!MODULE_ENABLED) return
360
-
361
- // connect to local pm2
362
- pm2.connect((err) => {
363
- if (err) {
364
- logger('error', `PM2 connection error:`, err)
365
-
366
- process.exit(1)
367
- }
368
-
369
- logger('info', 'Connected to PM2, starting monitor...')
370
-
371
- processChecker()
372
-
373
- setInterval(() => {
374
- processChecker()
375
- }, WORKER_INTERVAL)
376
- })
377
-
378
- /** PROB PMX **/
379
- Probe.metric({
380
- name: 'Zombie Restarts',
381
- value: () => {
382
- const res = []
383
-
384
- for (const [k, v] of zombieRestartHistory) {
385
- if (v > 0) {
386
- res.push([k, v])
387
- }
388
- }
389
-
390
- if (!res.length) return 'N/A'
391
-
392
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(' ; ')
393
- },
394
- })
395
-
396
- Probe.metric({
397
- name: 'Zombie Restarts (failed)',
398
- value: () => {
399
- const res = []
400
-
401
- for (const [k, v] of zombieRestartFailedHistory) {
402
- if (v > 0) {
403
- res.push([k, v])
404
- }
405
- }
406
-
407
- if (!res.length) return 'N/A'
408
-
409
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(' ; ')
410
- },
411
- })
412
-
413
- Probe.metric({
414
- name: 'Zombie Processes',
415
- value: () => {
416
- const res = []
417
-
418
- for (const [pmId, arr] of zombieCpuHistory) {
419
- if (isZombie(arr)) {
420
- res.push(pmId)
421
- }
422
- }
423
-
424
- if (!res.length) return 'N/A'
425
-
426
- return res.join(',')
427
- },
428
- })
429
-
430
- Probe.metric({
431
- name: 'CPU Overload Restarts',
432
- value: () => {
433
- const res = []
434
-
435
- for (const [k, v] of cpuOverloadRestartHistory) {
436
- if (v > 0) {
437
- res.push([k, v])
438
- }
439
- }
440
-
441
- if (!res.length) return 'N/A'
442
-
443
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(' ; ')
444
- },
445
- })
446
-
447
- Probe.metric({
448
- name: 'CPU Overload Processes',
449
- value: () => {
450
- const res = []
451
-
452
- for (const [pmId, arr] of cpuOverloadHistory) {
453
- if (isCpuOverload(arr)) {
454
- res.push(pmId)
455
- }
456
- }
457
-
458
- if (!res.length) return 'N/A'
459
-
460
- return res.join(',')
461
- },
462
- })
463
-
464
- Probe.metric({
465
- name: 'CPU Overload Restarts (failed)',
466
- value: () => {
467
- const res = []
468
-
469
- for (const [k, v] of cpuOverloadRestartFailedHistory) {
470
- if (v > 0) {
471
- res.push([k, v])
472
- }
473
- }
474
-
475
- if (!res.length) return 'N/A'
476
-
477
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(' ; ')
478
- },
479
- })
480
-
481
- Probe.metric({
482
- name: 'Processes in Sampling (perf)',
483
- value: () => {
484
- const res = []
485
-
486
- for (const [k, v] of perfSamplingStats) {
487
- if (v === true) {
488
- res.push(k)
489
- }
490
- }
491
-
492
- if (!res.length) return 'N/A'
493
-
494
- return res.join(', ')
495
- },
496
- })
497
- }
498
-
499
- runModule()
1
+ const pmx = require('pmx')
2
+ const pm2 = require('pm2')
3
+
4
+ const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
5
+ const {
6
+ parseParamToArray,
7
+ parseParamToNumber,
8
+ parseBool,
9
+ sleepAsync,
10
+ getSysCpuUsageByPid,
11
+ } = require('./utils')
12
+ const { defaultOptions } = require('./defaults')
13
+ const { sendMessage } = require('./message')
14
+ const { performPerfSampling } = require('./perf-sampler')
15
+ const { sendAlert } = require('./alert')
16
+ const { getJobConfHostName } = require('./job-conf')
17
+ const { isZombieCpuProcess, isZombieStateProcess } = require('./zombie-check')
18
+
19
/**
 * Effective module configuration: `defaultOptions` overlaid with the
 * configuration object that pmx passes to the init callback.
 *
 * NOTE(review): this relies on `pmx.initModule` returning the value returned
 * by the callback — confirm against the pmx version in use.
 *
 * @type {typeof defaultOptions & { module_name: string }}
 */
const conf = pmx.initModule({}, (err, incomingConf) => {
  if (err) {
    // The configuration object may be absent when initialisation failed, so
    // read the module name defensively instead of throwing a TypeError that
    // would hide the real error.
    const moduleName = incomingConf?.module_name ?? 'pm2-perfmonitor'

    console.error(`[${moduleName}] init module error:`, err)
    process.exit(2)
  }

  return {
    ...defaultOptions,
    ...incomingConf,
  }
})
33
+
34
// pmx probe handle; runModule registers the dashboard metrics on it.
const Probe = pmx.probe()
// Used as the log prefix and to skip this module's own process in the checker.
const MODULE_NAME = conf.module_name
// When false, runModule returns immediately and nothing is monitored.
const MODULE_ENABLED = parseBool(conf.enabled)
// Delay handed to setInterval between two processChecker runs.
const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
// App-name allow list / deny list; an empty list disables the respective filter.
const INCLUDE_APPS = parseParamToArray(conf.includeApps)
const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)

// zombie conf
const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
  conf.autoRestartWhenZombieDetected,
)
// Number of CPU samples kept per process and passed as `maxHits` to the zero-CPU check.
const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
// Upper bound on zombie restarts per pm_id; the limit only applies when the value is > 0.
const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)
// One of 'zombie-state', 'zero-cpu-consecutive', 'zombie-state-and-zero-cpu',
// 'zombie-state-or-zero-cpu'; any other value falls back to the zero-CPU check.
const zombieProcessDetectionStrategy = conf.zombieProcessDetectionStrategy

// cpu conf
const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
// A sample counts as overloaded when it is >= this threshold.
const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
// Number of CPU samples kept per process for the overload check.
const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
const enablePerfCollection = parseBool(conf.enablePerfCollection)
const perfReportGenerationDir = conf.perfReportGenerationDir
const flamegraphDir = conf.flamegraphDir
const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
const enableNodeInspectorCollection = parseBool(
  conf.enableNodeInspectorCollection,
)
// Multiplied by 1000 before being passed to sleepAsync — presumably seconds; confirm against sleepAsync.
const nodeInspectorSampleDuration = parseParamToNumber(
  conf.nodeInspectorSampleDuration,
)

// alert conf
const enableAlert = parseBool(conf.enableAlert)
const alertCmdPath = conf.alertCmdPath
const alertEnv = conf.alertEnv
const alertLevel = conf.alertLevel
const jobHostNameConfPath = conf.jobHostNameConfPath

// Per-process CPU sample history (pm_id -> [cpu1, cpu2, ...])
const zombieCpuHistory = new Map()
// pm_id -> number of successful zombie restarts
const zombieRestartHistory = new Map()
// pm_id -> number of failed zombie restart attempts
const zombieRestartFailedHistory = new Map()

// Same three maps for the CPU-overload path.
const cpuOverloadHistory = new Map()
const cpuOverloadRestartHistory = new Map()
const cpuOverloadRestartFailedHistory = new Map()

// Re-entrancy guard: processChecker returns early while a previous run is still in progress.
let isProcessCheckerRunning = false

/**
 * Whether a perf sample is currently being collected for a process (keyed by pm_id).
 * @type { Map<number,boolean> }
 */
const perfSamplingStats = new Map()
89
+
90
/**
 * Writes to the console through the requested method, putting the module
 * name in square brackets in front of the supplied values.
 *
 * @param {'log' | 'info' | 'error' | 'warn'} type - Console method to call.
 * @param {...*} args - Values forwarded to that method after the prefix.
 * @returns {*} Whatever the selected console method returns.
 */
const logger = (type, ...args) => {
  const prefix = `[${MODULE_NAME}]`

  return console[type](prefix, ...args)
}
97
+
98
/**
 * Decides whether a process counts as a zombie, according to
 * `zombieProcessDetectionStrategy`:
 *
 * - `zombie-state`: only the result of `isZombieStateProcess(pid)`.
 * - `zero-cpu-consecutive`: only the zero-CPU check over `cpus`.
 * - `zombie-state-and-zero-cpu`: both must agree; when the state lookup
 *   reports `failed`, the zero-CPU check alone decides.
 * - `zombie-state-or-zero-cpu`: either one is enough.
 * - any other value: same as `zero-cpu-consecutive`.
 *
 * @param { number } pid - System process id to inspect.
 * @param { number[] } cpus - Recent CPU usage samples of the process.
 * @returns { Promise<*> } The (normally boolean) verdict of the selected strategy.
 */
const isZombieProcess = async (pid, cpus) => {
  // Wrapped in a function so the pure `zombie-state` strategy never runs it.
  const hasZeroCpuStreak = () =>
    isZombieCpuProcess({
      cpus,
      maxHits: ZOMBIE_MAX_HITS,
    })

  switch (zombieProcessDetectionStrategy) {
    case 'zombie-state': {
      const stateResult = await isZombieStateProcess(pid)

      return stateResult.isZombie
    }

    case 'zombie-state-and-zero-cpu': {
      const stateResult = await isZombieStateProcess(pid)
      const zeroCpu = hasZeroCpuStreak()

      // When the system process state could not be obtained, fall back to
      // the zero-CPU verdict.
      return stateResult.failed ? zeroCpu : stateResult.isZombie && zeroCpu
    }

    case 'zombie-state-or-zero-cpu': {
      const stateResult = await isZombieStateProcess(pid)
      const zeroCpu = hasZeroCpuStreak()

      return stateResult.isZombie || zeroCpu
    }

    case 'zero-cpu-consecutive':
    default:
      return hasZeroCpuStreak()
  }
}
143
+
144
/**
 * Reports sustained CPU overload: at least `cpuOverloadMaxHits` samples are
 * present and none of them is below `cpuOverloadThreshold`.
 *
 * @param { number[] } cpus - Recent CPU usage samples of one process.
 * @returns { boolean } `true` when every retained sample reaches the threshold.
 */
const isCpuOverload = (cpus) => {
  const hasEnoughSamples = cpus.length >= cpuOverloadMaxHits

  if (!hasEnoughSamples) {
    return false
  }

  // Negated `>=` (rather than `<`) keeps non-numeric samples counted as "not overloaded".
  const hasCalmSample = cpus.some((sample) => !(sample >= cpuOverloadThreshold))

  return !hasCalmSample
}
153
+
154
/**
 * Appends a CPU sample to the zombie-detection history of a process and
 * trims the history to the most recent `ZOMBIE_MAX_HITS` samples.
 *
 * @param { number } pm_id - PM2 process id the sample belongs to.
 * @param { number } appCpuUsage - CPU usage sample to record.
 * @returns { number[] } The live history array stored for `pm_id`.
 */
const setZombieCpuHistory = (pm_id, appCpuUsage) => {
  let samples

  if (zombieCpuHistory.has(pm_id)) {
    samples = zombieCpuHistory.get(pm_id)
  } else {
    samples = []
    zombieCpuHistory.set(pm_id, samples)
  }

  samples.push(appCpuUsage)

  // One sample is added per call, so dropping the oldest one is enough.
  if (samples.length > ZOMBIE_MAX_HITS) {
    samples.shift()
  }

  return samples
}
175
+
176
/**
 * Appends a CPU sample to the overload-detection history of a process and
 * trims the history to the most recent `cpuOverloadMaxHits` samples.
 *
 * @param { number } pm_id - PM2 process id the sample belongs to.
 * @param { number } appCpuUsage - CPU usage sample to record.
 * @returns { number[] } The live history array stored for `pm_id`.
 */
const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
  let samples

  if (cpuOverloadHistory.has(pm_id)) {
    samples = cpuOverloadHistory.get(pm_id)
  } else {
    samples = []
    cpuOverloadHistory.set(pm_id, samples)
  }

  samples.push(appCpuUsage)

  // One sample is added per call, so dropping the oldest one is enough.
  if (samples.length > cpuOverloadMaxHits) {
    samples.shift()
  }

  return samples
}
197
+
198
/**
 * Counts one more failed zombie restart for a process.
 *
 * @param { number } pm_id - PM2 process id whose restart attempt failed.
 */
const setZombieRestartFailedHistory = (pm_id) => {
  const previousFailures = zombieRestartFailedHistory.has(pm_id)
    ? zombieRestartFailedHistory.get(pm_id)
    : 0

  zombieRestartFailedHistory.set(pm_id, previousFailures + 1)
}
208
+
209
/**
 * Counts one more failed CPU-overload restart for a process.
 *
 * @param { number } pm_id - PM2 process id whose restart attempt failed.
 */
const setCpuOverloadRestartFailedHistory = (pm_id) => {
  const previousFailures = cpuOverloadRestartFailedHistory.has(pm_id)
    ? cpuOverloadRestartFailedHistory.get(pm_id)
    : 0

  cpuOverloadRestartFailedHistory.set(pm_id, previousFailures + 1)
}
219
+
220
/**
 * Sends a restart alert via `sendAlert`, using the configured command,
 * environment and level. Does nothing (and resolves to `undefined`) when
 * alerting is disabled.
 *
 * @param {string} title - Text placed after the `[module] Alert:` title prefix.
 * @param {string} message - Text placed after the host name and timestamp in the content.
 * @returns {Promise<*>} Resolves with whatever `sendAlert` resolves with.
 */
const sendRestartAlert = async (title, message) => {
  if (enableAlert) {
    const timestamp = new Date().toLocaleString()
    const hostName = getJobConfHostName(jobHostNameConfPath)

    const alertPayload = {
      cmd: alertCmdPath,
      env: alertEnv,
      level: alertLevel,
      title: `[${MODULE_NAME}] Alert: ${title}`,
      content: `[${hostName}] [${timestamp}] - ${message}`,
    }

    return await sendAlert(alertPayload)
  }

  return undefined
}
239
+
240
/**
 * Sends a restart alert and logs (instead of propagating) a delivery failure,
 * so a broken alert channel is never mistaken for a failed restart.
 *
 * @param {string} title
 * @param {string} message
 */
const sendRestartAlertSafely = async (title, message) => {
  try {
    await sendRestartAlert(title, message)
  } catch (alertErr) {
    logger('error', `Failed to send alert "${title}":`, alertErr)
  }
}

/**
 * Increments the counter stored for `pm_id` and returns the new value.
 *
 * @param { Map<number, number> } counters
 * @param { number } pm_id
 * @returns { number }
 */
const bumpRestartCount = (counters, pm_id) => {
  const next = (counters.get(pm_id) ?? 0) + 1

  counters.set(pm_id, next)

  return next
}

/**
 * Handles a process that was classified as a zombie: logs it and, when
 * auto-restart is enabled and the restart limit is not reached, restarts it.
 *
 * @param {{ name: string, pid: number, pm_id: number }} app
 */
const handleZombieApp = async ({ name, pid, pm_id }) => {
  logger('info', `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${pid})`)

  if (!AUTO_RESTART_WHEN_ZOMBIE_DETECTED) return

  // A limit of 0 (or less) means "no limit".
  if (
    ZOMBIE_MAX_RESTARTS > 0 &&
    zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
  ) {
    return
  }

  logger('info', 'restarting...')

  try {
    await restartAppAsync(pm_id)
  } catch (restartErr) {
    logger(
      'error',
      `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
      restartErr,
    )

    setZombieRestartFailedHistory(pm_id)

    return
  }

  const restarts = bumpRestartCount(zombieRestartHistory, pm_id)

  logger(
    'info',
    `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${restarts} times`,
  )

  await sendRestartAlertSafely(
    `The zombie process has been restarted!`,
    `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${restarts} times`,
  )

  // Clear the samples after a restart so the fresh process is not
  // immediately classified as a zombie again.
  zombieCpuHistory.delete(pm_id)
}

/**
 * Handles a process with sustained CPU overload: optionally collects a perf
 * sample first, then restarts the process.
 *
 * @param {{ name: string, pid: number, pm_id: number }} app
 */
const handleCpuOverloadApp = async ({ name, pid, pm_id }) => {
  logger(
    'info',
    `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${pid})`,
  )

  if (enablePerfCollection) {
    perfSamplingStats.set(pm_id, true)

    try {
      await performPerfSampling({
        pid,
        moduleName: MODULE_NAME,
        perfDir: perfReportGenerationDir,
        flamegraphDir,
        sampleDuration: perfSampleDuration,
        sampleFrequency: perfSampleFrequency,
      })
    } finally {
      // Always drop the "sampling in progress" flag, even when sampling
      // throws, so the metric does not report a stuck sampler.
      perfSamplingStats.delete(pm_id)
    }
  }

  logger('info', 'restarting...')

  try {
    await restartAppAsync(pm_id)
  } catch (restartErr) {
    logger(
      'error',
      `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
      restartErr,
    )

    setCpuOverloadRestartFailedHistory(pm_id)

    return
  }

  const restarts = bumpRestartCount(cpuOverloadRestartHistory, pm_id)

  logger(
    'info',
    `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${restarts} times`,
  )

  await sendRestartAlertSafely(
    `CPU overload process restarted!`,
    `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${restarts} times`,
  )

  cpuOverloadHistory.delete(pm_id)
}

/**
 * Runs one monitoring step for a single PM2 app: filters it, records a CPU
 * sample, optionally triggers in-process CPU profiling, then applies the
 * zombie check and — only if that did not match — the CPU-overload check.
 *
 * @param {object} app - One entry of the list returned by `listAppsAsync`.
 */
const checkApp = async (app) => {
  const { name, pid, pm_id, monit, pm2_env } = app

  // Skip this module itself and apps excluded by the include/exclude lists.
  if (
    MODULE_NAME === name ||
    (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
    (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
  ) {
    return
  }

  // Only online processes are checked; drop stale samples of the others so
  // they do not influence a later verdict.
  if (pm2_env?.status !== 'online') {
    zombieCpuHistory.delete(pm_id)
    cpuOverloadHistory.delete(pm_id)

    return
  }

  // Prefer the system-level reading; fall back to the value reported by PM2.
  const sysCpuUsage = await getSysCpuUsageByPid(pid)
  const appCpuUsage = typeof sysCpuUsage === 'number' ? sysCpuUsage : monit?.cpu

  const zombieSamples = setZombieCpuHistory(pm_id, appCpuUsage)
  const overloadSamples = setCpuOverloadHistory(pm_id, appCpuUsage)

  // Ask the app process to record a CPU profile while it is above the threshold.
  if (enableNodeInspectorCollection && appCpuUsage >= cpuOverloadThreshold) {
    await sendMessage(pm_id, 'cpu-profile-start')
    await sleepAsync(nodeInspectorSampleDuration * 1000)
    await sendMessage(pm_id, 'cpu-profile-stop')
  }

  // The zombie probe is only run when zombie detection is enabled.
  if (ZOMBIE_DETECTION && (await isZombieProcess(pid, zombieSamples))) {
    await handleZombieApp(app)
  } else if (cpuOverloadDetection && isCpuOverload(overloadSamples)) {
    await handleCpuOverloadApp(app)
  }
}

/**
 * Checks every PM2 app once. A run that starts while a previous one is still
 * in progress returns immediately. A failure while checking one app is logged
 * and does not stop the remaining apps from being checked.
 *
 * @returns { Promise<void> } Never rejects; errors are logged.
 */
const processChecker = async () => {
  if (isProcessCheckerRunning) return

  try {
    isProcessCheckerRunning = true

    const apps = await listAppsAsync()

    for (const app of apps) {
      try {
        await checkApp(app)
      } catch (appErr) {
        logger(
          'error',
          `Check failed for ${app?.name} (pm_id: ${app?.pm_id}):`,
          appErr,
        )
      }
    }
  } catch (err) {
    logger('error', err)
  } finally {
    isProcessCheckerRunning = false
  }
}
408
+
409
/**
 * Renders a `pm_id -> count` map as `[pm_id]:count` pairs joined by `;`,
 * skipping counts that are not positive. Returns `'N/A'` when nothing is left.
 *
 * @param { Map<number, number> } counters
 * @returns { string }
 */
const formatRestartCounters = (counters) => {
  const parts = []

  for (const [pmId, count] of counters) {
    if (count > 0) {
      parts.push(`[${pmId}]:${count}`)
    }
  }

  return parts.length ? parts.join(';') : 'N/A'
}

/**
 * Starts the module: connects to the local PM2 daemon, runs the process
 * checker once and then every `WORKER_INTERVAL`, and registers the pmx
 * metrics. Does nothing when the module is disabled.
 */
const runModule = () => {
  if (!MODULE_ENABLED) return

  // connect to local pm2
  pm2.connect((err) => {
    if (err) {
      logger('error', `PM2 connection error:`, err)

      process.exit(1)

      return
    }

    logger('info', 'Connected to PM2, starting monitor...')

    // processChecker handles its own errors, so the promise is not awaited.
    processChecker()

    setInterval(() => {
      processChecker()
    }, WORKER_INTERVAL)
  })

  // pmx metrics: restart counters per pm_id. The maps are read each time a
  // metric value is requested, so the output always reflects current state.
  Probe.metric({
    name: 'Zombie Restarts',
    value: () => formatRestartCounters(zombieRestartHistory),
  })

  Probe.metric({
    name: 'Zombie Restarts (failed)',
    value: () => formatRestartCounters(zombieRestartFailedHistory),
  })

  Probe.metric({
    name: 'CPU Overload Restarts',
    value: () => formatRestartCounters(cpuOverloadRestartHistory),
  })

  Probe.metric({
    name: 'CPU Overload Restarts (failed)',
    value: () => formatRestartCounters(cpuOverloadRestartFailedHistory),
  })

  // pm_ids for which a perf sample is being collected right now.
  Probe.metric({
    name: 'Processes in Sampling (perf)',
    value: () => {
      const sampling = []

      for (const [pmId, active] of perfSamplingStats) {
        if (active === true) {
          sampling.push(pmId)
        }
      }

      return sampling.length ? sampling.join(', ') : 'N/A'
    },
  })
}
515
+
516
+ runModule()