pm2-perfmonitor 2.5.2 → 2.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/app.js CHANGED
@@ -1,516 +1,581 @@
1
- const pmx = require('pmx')
2
- const pm2 = require('pm2')
3
-
4
- const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
5
- const {
6
- parseParamToArray,
7
- parseParamToNumber,
8
- parseBool,
9
- sleepAsync,
10
- getSysCpuUsageByPid,
11
- } = require('./utils')
12
- const { defaultOptions } = require('./defaults')
13
- const { sendMessage } = require('./message')
14
- const { performPerfSampling } = require('./perf-sampler')
15
- const { sendAlert } = require('./alert')
16
- const { getJobConfHostName } = require('./job-conf')
17
- const { isZombieCpuProcess, isZombieStateProcess } = require('./zombie-check')
18
-
19
- /**
20
- * @type { defaultOptions }
21
- */
22
- const conf = pmx.initModule({}, (err, incomingConf) => {
23
- if (err) {
24
- console.error(`[${incomingConf.module_name}] init module error:`, err)
25
- process.exit(2)
26
- }
27
-
28
- return {
29
- ...defaultOptions,
30
- ...incomingConf,
31
- }
32
- })
33
-
34
- const Probe = pmx.probe()
35
- const MODULE_NAME = conf.module_name
36
- const MODULE_ENABLED = parseBool(conf.enabled)
37
- const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
38
- const INCLUDE_APPS = parseParamToArray(conf.includeApps)
39
- const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)
40
-
41
- // zombie conf
42
- const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
43
- const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
44
- conf.autoRestartWhenZombieDetected,
45
- )
46
- const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
47
- const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)
48
- const zombieProcessDetectionStrategy = conf.zombieProcessDetectionStrategy
49
-
50
- // cpu conf
51
- const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
52
- const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
53
- const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
54
- const enablePerfCollection = parseBool(conf.enablePerfCollection)
55
- const perfReportGenerationDir = conf.perfReportGenerationDir
56
- const flamegraphDir = conf.flamegraphDir
57
- const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
58
- const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
59
- const enableNodeInspectorCollection = parseBool(
60
- conf.enableNodeInspectorCollection,
61
- )
62
- const nodeInspectorSampleDuration = parseParamToNumber(
63
- conf.nodeInspectorSampleDuration,
64
- )
65
-
66
- // alert conf
67
- const enableAlert = parseBool(conf.enableAlert)
68
- const alertCmdPath = conf.alertCmdPath
69
- const alertEnv = conf.alertEnv
70
- const alertLevel = conf.alertLevel
71
- const jobHostNameConfPath = conf.jobHostNameConfPath
72
-
73
- // 存储每个进程的 CPU 采样历史(pm_id -> [cpu1, cpu2, ...])
74
- const zombieCpuHistory = new Map()
75
- const zombieRestartHistory = new Map()
76
- const zombieRestartFailedHistory = new Map()
77
-
78
- const cpuOverloadHistory = new Map()
79
- const cpuOverloadRestartHistory = new Map()
80
- const cpuOverloadRestartFailedHistory = new Map()
81
-
82
- let isProcessCheckerRunning = false
83
-
84
- /**
85
- * perf 样本是否采集中
86
- * @type { Map<number,boolean> }
87
- */
88
- const perfSamplingStats = new Map()
89
-
90
- /**
91
- * @param {'log' | 'info' | 'error' | 'warn'} type
92
- *
93
- */
94
- const logger = (type, ...args) => {
95
- return console[type](`[${MODULE_NAME}]`, ...args)
96
- }
97
-
98
- /**
99
- * 判断是否为僵尸进程:最近 ZOMBIE_MAX_HITS 次全是 0%
100
- * @param { number } pid
101
- * @param { number[] } cpus
102
- */
103
- const isZombieProcess = async (pid, cpus) => {
104
- if (zombieProcessDetectionStrategy === 'zombie-state') {
105
- const res = await isZombieStateProcess(pid)
106
-
107
- return res.isZombie
108
- } else if (zombieProcessDetectionStrategy === 'zero-cpu-consecutive') {
109
- return isZombieCpuProcess({
110
- cpus,
111
- maxHits: ZOMBIE_MAX_HITS,
112
- })
113
- } else if (zombieProcessDetectionStrategy === 'zombie-state-and-zero-cpu') {
114
- const { isZombie: zombieState, failed } = await isZombieStateProcess(pid)
115
-
116
- const zombieCpu = isZombieCpuProcess({
117
- cpus,
118
- maxHits: ZOMBIE_MAX_HITS,
119
- })
120
-
121
- // 获取系统进程状态失败时,回退为通过 CPU 持续 0% 判定
122
- if (failed) {
123
- return zombieCpu
124
- }
125
-
126
- return zombieState && zombieCpu
127
- } else if (zombieProcessDetectionStrategy === 'zombie-state-or-zero-cpu') {
128
- const { isZombie: zombieState } = await isZombieStateProcess(pid)
129
-
130
- const zombieCpu = isZombieCpuProcess({
131
- cpus,
132
- maxHits: ZOMBIE_MAX_HITS,
133
- })
134
-
135
- return zombieState || zombieCpu
136
- }
137
-
138
- return isZombieCpuProcess({
139
- cpus,
140
- maxHits: ZOMBIE_MAX_HITS,
141
- })
142
- }
143
-
144
- /**
145
- * @param { number[] } cpus
146
- */
147
- const isCpuOverload = (cpus) => {
148
- return (
149
- cpus.length >= cpuOverloadMaxHits &&
150
- cpus.every((v) => v >= cpuOverloadThreshold)
151
- )
152
- }
153
-
154
- /**
155
- * @param { number } pm_id
156
- * @param { number } appCpuUsage
157
- * @returns { number[] } 对应 pm_id 的 CPU 使用率数组
158
- */
159
- const setZombieCpuHistory = (pm_id, appCpuUsage) => {
160
- if (!zombieCpuHistory.has(pm_id)) {
161
- zombieCpuHistory.set(pm_id, [])
162
- }
163
-
164
- const history = zombieCpuHistory.get(pm_id)
165
-
166
- history.push(appCpuUsage)
167
-
168
- // 只保留最近 ZOMBIE_MAX_HITS 次记录
169
- if (history.length > ZOMBIE_MAX_HITS) {
170
- history.shift()
171
- }
172
-
173
- return history
174
- }
175
-
176
- /**
177
- * @param { number } pm_id
178
- * @param { number } appCpuUsage
179
- * @returns { number[] } 对应 pm_id 的 CPU 使用率数组
180
- */
181
- const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
182
- if (!cpuOverloadHistory.has(pm_id)) {
183
- cpuOverloadHistory.set(pm_id, [])
184
- }
185
-
186
- const history = cpuOverloadHistory.get(pm_id)
187
-
188
- history.push(appCpuUsage)
189
-
190
- // 只保留最近 x 次记录
191
- if (history.length > cpuOverloadMaxHits) {
192
- history.shift()
193
- }
194
-
195
- return history
196
- }
197
-
198
- const setZombieRestartFailedHistory = (pm_id) => {
199
- if (!zombieRestartFailedHistory.has(pm_id)) {
200
- zombieRestartFailedHistory.set(pm_id, 1)
201
- } else {
202
- zombieRestartFailedHistory.set(
203
- pm_id,
204
- zombieRestartFailedHistory.get(pm_id) + 1,
205
- )
206
- }
207
- }
208
-
209
- const setCpuOverloadRestartFailedHistory = (pm_id) => {
210
- if (!cpuOverloadRestartFailedHistory.has(pm_id)) {
211
- cpuOverloadRestartFailedHistory.set(pm_id, 1)
212
- } else {
213
- cpuOverloadRestartFailedHistory.set(
214
- pm_id,
215
- cpuOverloadRestartFailedHistory.get(pm_id) + 1,
216
- )
217
- }
218
- }
219
-
220
- /**
221
- * 发送重启警告
222
- * @param {string} title
223
- * @param {string} message
224
- */
225
- const sendRestartAlert = async (title, message) => {
226
- if (!enableAlert) return
227
-
228
- const datetime = new Date().toLocaleString()
229
- const jobHostName = getJobConfHostName(jobHostNameConfPath)
230
-
231
- return await sendAlert({
232
- cmd: alertCmdPath,
233
- env: alertEnv,
234
- level: alertLevel,
235
- title: `[${MODULE_NAME}] Alert: ${title}`,
236
- content: `[${jobHostName}] [${datetime}] - ${message}`,
237
- })
238
- }
239
-
240
- /**
241
- * check process
242
- */
243
- const processChecker = async () => {
244
- if (isProcessCheckerRunning) return
245
-
246
- try {
247
- isProcessCheckerRunning = true
248
-
249
- const apps = await listAppsAsync()
250
-
251
- for (const app of apps) {
252
- const { name, pid, pm_id, monit, pm2_env } = app
253
-
254
- const appStatus = pm2_env?.status
255
-
256
- // 非目标应用,跳过
257
- if (
258
- MODULE_NAME === name ||
259
- (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
260
- (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
261
- ) {
262
- continue
263
- }
264
-
265
- // 只处理 online 状态的进程
266
- if (appStatus !== 'online') {
267
- // 进程不在 online 状态时,清空其历史记录,避免干扰
268
- zombieCpuHistory.delete(pm_id)
269
- cpuOverloadHistory.delete(pm_id)
270
-
271
- continue
272
- }
273
-
274
- const pm2CpuUsage = monit?.cpu
275
- const sysCpuUsage = await getSysCpuUsageByPid(pid)
276
- const appCpuUsage =
277
- typeof sysCpuUsage === 'number' ? sysCpuUsage : pm2CpuUsage
278
-
279
- const cpuHistory = setZombieCpuHistory(pm_id, appCpuUsage)
280
- const cpuHistory2 = setCpuOverloadHistory(pm_id, appCpuUsage)
281
-
282
- // 发送消息通知对应应用进程,采样 CPU 性能
283
- if (enableNodeInspectorCollection) {
284
- if (appCpuUsage >= cpuOverloadThreshold) {
285
- await sendMessage(pm_id, 'cpu-profile-start')
286
- await sleepAsync(nodeInspectorSampleDuration * 1000)
287
- await sendMessage(pm_id, 'cpu-profile-stop')
288
- }
289
- }
290
-
291
- const zombieDetected = await isZombieProcess(pid, cpuHistory)
292
-
293
- // 判断是否为僵尸:最近 ZOMBIE_MAX_HITS 次全是 0%
294
- if (ZOMBIE_DETECTION && zombieDetected) {
295
- logger(
296
- 'info',
297
- `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
298
- )
299
-
300
- if (AUTO_RESTART_WHEN_ZOMBIE_DETECTED) {
301
- if (
302
- ZOMBIE_MAX_RESTARTS > 0 &&
303
- zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
304
- ) {
305
- continue
306
- }
307
-
308
- logger('info', 'restarting...')
309
-
310
- try {
311
- await restartAppAsync(pm_id)
312
-
313
- if (!zombieRestartHistory.has(pm_id)) {
314
- zombieRestartHistory.set(pm_id, 1)
315
- } else {
316
- const history = zombieRestartHistory.get(pm_id)
317
-
318
- zombieRestartHistory.set(pm_id, history + 1)
319
- }
320
-
321
- logger(
322
- 'info',
323
- `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${zombieRestartHistory.get(pm_id)} times`,
324
- )
325
-
326
- await sendRestartAlert(
327
- `The zombie process has been restarted!`,
328
- `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${zombieRestartHistory.get(pm_id)} times`,
329
- )
330
-
331
- // 重启后清除该进程的历史记录,避免刚重启又被判定为僵尸
332
- zombieCpuHistory.delete(pm_id)
333
- } catch (restartErr) {
334
- logger(
335
- 'error',
336
- `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
337
- restartErr,
338
- )
339
-
340
- setZombieRestartFailedHistory(pm_id)
341
- }
342
- }
343
- }
344
- // CPU 是否持续过载
345
- else if (cpuOverloadDetection && isCpuOverload(cpuHistory2)) {
346
- logger(
347
- 'info',
348
- `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
349
- )
350
-
351
- if (enablePerfCollection && !perfSamplingStats.get(pm_id)) {
352
- perfSamplingStats.set(pm_id, true)
353
-
354
- await performPerfSampling({
355
- pid,
356
- moduleName: MODULE_NAME,
357
- perfDir: perfReportGenerationDir,
358
- flamegraphDir,
359
- sampleDuration: perfSampleDuration,
360
- sampleFrequency: perfSampleFrequency,
361
- })
362
-
363
- perfSamplingStats.delete(pm_id)
364
- }
365
-
366
- try {
367
- logger('info', 'restarting...')
368
-
369
- await restartAppAsync(pm_id)
370
-
371
- if (!cpuOverloadRestartHistory.has(pm_id)) {
372
- cpuOverloadRestartHistory.set(pm_id, 1)
373
- } else {
374
- cpuOverloadRestartHistory.set(
375
- pm_id,
376
- cpuOverloadRestartHistory.get(pm_id) + 1,
377
- )
378
- }
379
-
380
- logger(
381
- 'info',
382
- `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${cpuOverloadRestartHistory.get(pm_id)} times`,
383
- )
384
-
385
- await sendRestartAlert(
386
- `CPU overload process restarted!`,
387
- `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${cpuOverloadRestartHistory.get(pm_id)} times`,
388
- )
389
-
390
- cpuOverloadHistory.delete(pm_id)
391
- } catch (restartErr) {
392
- logger(
393
- 'error',
394
- `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
395
- restartErr,
396
- )
397
-
398
- setCpuOverloadRestartFailedHistory(pm_id)
399
- }
400
- }
401
- }
402
- } catch (err) {
403
- logger('error', err)
404
- } finally {
405
- isProcessCheckerRunning = false
406
- }
407
- }
408
-
409
- const runModule = () => {
410
- if (!MODULE_ENABLED) return
411
-
412
- // connect to local pm2
413
- pm2.connect((err) => {
414
- if (err) {
415
- logger('error', `PM2 connection error:`, err)
416
-
417
- process.exit(1)
418
- }
419
-
420
- logger('info', 'Connected to PM2, starting monitor...')
421
-
422
- processChecker()
423
-
424
- setInterval(() => {
425
- processChecker()
426
- }, WORKER_INTERVAL)
427
- })
428
-
429
- /** PROB PMX **/
430
- Probe.metric({
431
- name: 'Zombie Restarts',
432
- value: () => {
433
- const res = []
434
-
435
- for (const [k, v] of zombieRestartHistory) {
436
- if (v > 0) {
437
- res.push([k, v])
438
- }
439
- }
440
-
441
- if (!res.length) return 'N/A'
442
-
443
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
444
- },
445
- })
446
-
447
- Probe.metric({
448
- name: 'Zombie Restarts (failed)',
449
- value: () => {
450
- const res = []
451
-
452
- for (const [k, v] of zombieRestartFailedHistory) {
453
- if (v > 0) {
454
- res.push([k, v])
455
- }
456
- }
457
-
458
- if (!res.length) return 'N/A'
459
-
460
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
461
- },
462
- })
463
-
464
- Probe.metric({
465
- name: 'CPU Overload Restarts',
466
- value: () => {
467
- const res = []
468
-
469
- for (const [k, v] of cpuOverloadRestartHistory) {
470
- if (v > 0) {
471
- res.push([k, v])
472
- }
473
- }
474
-
475
- if (!res.length) return 'N/A'
476
-
477
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
478
- },
479
- })
480
-
481
- Probe.metric({
482
- name: 'CPU Overload Restarts (failed)',
483
- value: () => {
484
- const res = []
485
-
486
- for (const [k, v] of cpuOverloadRestartFailedHistory) {
487
- if (v > 0) {
488
- res.push([k, v])
489
- }
490
- }
491
-
492
- if (!res.length) return 'N/A'
493
-
494
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
495
- },
496
- })
497
-
498
- Probe.metric({
499
- name: 'Processes in Sampling (perf)',
500
- value: () => {
501
- const res = []
502
-
503
- for (const [k, v] of perfSamplingStats) {
504
- if (v === true) {
505
- res.push(k)
506
- }
507
- }
508
-
509
- if (!res.length) return 'N/A'
510
-
511
- return res.join(', ')
512
- },
513
- })
514
- }
515
-
516
- runModule()
1
+ const pmx = require('pmx')
2
+ const pm2 = require('pm2')
3
+
4
+ const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
5
+ const {
6
+ parseParamToArray,
7
+ parseParamToNumber,
8
+ parseBool,
9
+ safeToFixed,
10
+ sleepAsync,
11
+ getSysCpuUsageByPid,
12
+ } = require('./utils')
13
+ const { defaultOptions } = require('./defaults')
14
+ const { sendMessage } = require('./message')
15
+ const { performPerfSampling } = require('./perf-sampler')
16
+ const { sendAlert } = require('./alert')
17
+ const { getJobConfHostName } = require('./job-conf')
18
+ const { isZombieCpuProcess, isZombieStateProcess } = require('./zombie-check')
19
+
20
/**
 * Effective module configuration: the packaged defaults overlaid with the
 * configuration handed to the `pmx.initModule` callback.
 *
 * NOTE(review): this relies on `pmx.initModule` returning whatever the
 * callback returns — confirm against the pmx version in use.
 *
 * @type { defaultOptions }
 */
const conf = pmx.initModule({}, (err, incomingConf) => {
  if (err) {
    // `incomingConf` may be missing when initialisation failed; reading a
    // property of it unguarded would throw and hide the real error.
    const moduleName = incomingConf?.module_name ?? 'pm2-perfmonitor'

    console.error(`[${moduleName}] init module error:`, err)
    process.exit(2)
  }

  // Later spreads win, so incoming values override the defaults
  return {
    ...defaultOptions,
    ...incomingConf,
  }
})
34
+
35
// PMX probe used to publish the module metrics
const Probe = pmx.probe()

// Settings that are used exactly as configured (no parsing step)
const {
  module_name: MODULE_NAME,
  zombieProcessDetectionStrategy,
  perfReportGenerationDir,
  flamegraphDir,
  alertCmdPath,
  alertEnv,
  alertLevel,
  jobHostNameConfPath,
} = conf

// General settings
const MODULE_ENABLED = parseBool(conf.enabled)
const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
const INCLUDE_APPS = parseParamToArray(conf.includeApps)
const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)

// Zombie detection settings
const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
  conf.autoRestartWhenZombieDetected,
)
const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)

// CPU overload detection and profiling settings
const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
const enablePerfCollection = parseBool(conf.enablePerfCollection)
const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
const enableNodeInspectorCollection = parseBool(
  conf.enableNodeInspectorCollection,
)
const nodeInspectorSampleDuration = parseParamToNumber(
  conf.nodeInspectorSampleDuration,
)

// Alert settings
const enableAlert = parseBool(conf.enableAlert)
73
+
74
// Per-process state, keyed by pm_id.

/**
 * CPU samples used for zombie detection (pm_id -> [cpu1, cpu2, ...])
 * @type { Map<number, number[]> }
 */
const zombieCpuHistory = new Map()
/**
 * Successful zombie restarts per process
 * @type { Map<number, number> }
 */
const zombieRestartHistory = new Map()
/**
 * Failed zombie restarts per process
 * @type { Map<number, number> }
 */
const zombieRestartFailedHistory = new Map()

/**
 * CPU samples used for overload detection (pm_id -> [cpu1, cpu2, ...])
 * @type { Map<number, number[]> }
 */
const cpuOverloadHistory = new Map()
/**
 * Successful CPU-overload restarts per process
 * @type { Map<number, number> }
 */
const cpuOverloadRestartHistory = new Map()
/**
 * Failed CPU-overload restarts per process
 * @type { Map<number, number> }
 */
const cpuOverloadRestartFailedHistory = new Map()

/**
 * CPU usage reported by PM2 during the latest check
 * @type { Map<number, number> }
 */
const cpuUsageFromPM2 = new Map()
/**
 * CPU usage read from the system during the latest check
 * @type { Map<number, number> }
 */
const cpuUsageFromSys = new Map()

// Guards against overlapping runs of the process checker
let isProcessCheckerRunning = false

/**
 * Processes for which perf sampling is currently in progress
 * @type { Map<number,boolean> }
 */
const perfSamplingStats = new Map()
99
+
100
/**
 * Writes to the console, prefixing every message with the module name.
 *
 * @param {'log' | 'info' | 'error' | 'warn'} type console method to use
 * @param {...any} args values forwarded to the console after the prefix
 */
const logger = (type, ...args) => {
  const prefix = `[${MODULE_NAME}]`

  return console[type](prefix, ...args)
}
107
+
108
/**
 * Decides whether a process counts as a zombie, using the strategy selected by
 * `zombieProcessDetectionStrategy`. Unknown strategies fall back to the CPU
 * history check.
 *
 * @param { number } pid system process id, passed to the process-state check
 * @param { number[] } cpus recent CPU samples, passed to the CPU-history check
 * @returns { Promise<boolean> } presumably a boolean; the value comes straight
 * from the zombie-check helpers
 */
const isZombieProcess = async (pid, cpus) => {
  // CPU-history verdict; wrapped so each strategy decides when to evaluate it
  const checkCpuHistory = () =>
    isZombieCpuProcess({
      cpus,
      maxHits: ZOMBIE_MAX_HITS,
    })

  switch (zombieProcessDetectionStrategy) {
    case 'zombie-state': {
      const stateResult = await isZombieStateProcess(pid)

      return stateResult.isZombie
    }

    case 'zombie-state-and-zero-cpu': {
      const stateResult = await isZombieStateProcess(pid)
      const cpuVerdict = checkCpuHistory()

      // When the process state could not be read, rely on the CPU history alone
      if (stateResult.failed) {
        return cpuVerdict
      }

      return stateResult.isZombie && cpuVerdict
    }

    case 'zombie-state-or-zero-cpu': {
      const stateResult = await isZombieStateProcess(pid)
      const cpuVerdict = checkCpuHistory()

      return stateResult.isZombie || cpuVerdict
    }

    case 'zero-cpu-consecutive':
    default:
      return checkCpuHistory()
  }
}
153
+
154
/**
 * Tells whether a process is in sustained CPU overload: the most recent
 * `cpuOverloadMaxHits` samples all reach `cpuOverloadThreshold`.
 *
 * @param { number[] } cpus CPU samples, oldest first
 * @returns { boolean } true only when enough samples exist and all of the most
 * recent ones are at or above the threshold
 */
const isCpuOverload = (cpus) => {
  // A zero, negative or NaN hit count must never match: with an empty history
  // `every` is vacuously true and every process would be flagged as overloaded.
  if (!(cpuOverloadMaxHits > 0)) return false

  if (cpus.length < cpuOverloadMaxHits) return false

  // Only the newest samples count, even if the caller keeps a longer history
  return cpus
    .slice(-cpuOverloadMaxHits)
    .every((usage) => usage >= cpuOverloadThreshold)
}
163
+
164
/**
 * Appends a sample to the list stored under `pm_id` in `store` (creating the
 * list on first use) and drops the oldest entries beyond `maxLength`.
 *
 * @param { Map<number, number[]> } store history map to update
 * @param { number } pm_id
 * @param { number } sample
 * @param { number } maxLength number of most recent samples to keep
 * @returns { number[] } the stored (mutated) list for `pm_id`
 */
const pushCappedSample = (store, pm_id, sample, maxLength) => {
  let history = store.get(pm_id)

  if (!history) {
    history = []
    store.set(pm_id, history)
  }

  history.push(sample)

  // Remove the whole excess at once; a NaN limit never trims, as before
  if (history.length > maxLength) {
    history.splice(0, Math.ceil(history.length - maxLength))
  }

  return history
}

/**
 * Records a CPU sample for zombie detection, keeping only the most recent
 * ZOMBIE_MAX_HITS samples.
 *
 * @param { number } pm_id
 * @param { number } appCpuUsage
 * @returns { number[] } CPU usage samples stored for this pm_id
 */
const setZombieCpuHistory = (pm_id, appCpuUsage) => {
  return pushCappedSample(zombieCpuHistory, pm_id, appCpuUsage, ZOMBIE_MAX_HITS)
}

/**
 * Records a CPU sample for overload detection, keeping only the most recent
 * cpuOverloadMaxHits samples.
 *
 * @param { number } pm_id
 * @param { number } appCpuUsage
 * @returns { number[] } CPU usage samples stored for this pm_id
 */
const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
  return pushCappedSample(
    cpuOverloadHistory,
    pm_id,
    appCpuUsage,
    cpuOverloadMaxHits,
  )
}
207
+
208
/**
 * Increments the failed zombie-restart counter of a process, starting at 1.
 *
 * @param { number } pm_id
 */
const setZombieRestartFailedHistory = (pm_id) => {
  const failures = zombieRestartFailedHistory.has(pm_id)
    ? zombieRestartFailedHistory.get(pm_id) + 1
    : 1

  zombieRestartFailedHistory.set(pm_id, failures)
}
218
+
219
/**
 * Increments the failed CPU-overload-restart counter of a process, starting
 * at 1.
 *
 * @param { number } pm_id
 */
const setCpuOverloadRestartFailedHistory = (pm_id) => {
  const failures = cpuOverloadRestartFailedHistory.has(pm_id)
    ? cpuOverloadRestartFailedHistory.get(pm_id) + 1
    : 1

  cpuOverloadRestartFailedHistory.set(pm_id, failures)
}
229
+
230
/**
 * Sends a restart alert through `sendAlert`; does nothing when alerts are
 * disabled.
 *
 * @param {string} title appended to the "[module] Alert: " prefix
 * @param {string} message appended to the host name and timestamp
 * @returns the result of `sendAlert`, or undefined when alerts are disabled
 */
const sendRestartAlert = async (title, message) => {
  if (enableAlert) {
    const timestamp = new Date().toLocaleString()
    const hostName = getJobConfHostName(jobHostNameConfPath)

    const alert = {
      cmd: alertCmdPath,
      env: alertEnv,
      level: alertLevel,
      title: `[${MODULE_NAME}] Alert: ${title}`,
      content: `[${hostName}] [${timestamp}] - ${message}`,
    }

    return await sendAlert(alert)
  }
}
249
+
250
/**
 * Restarts a process flagged as a zombie (when auto-restart is enabled and the
 * restart budget is not used up), records the outcome and sends an alert.
 *
 * @param {{ name: string, pid: number, pm_id: number }} app
 */
const restartZombieApp = async ({ name, pid, pm_id }) => {
  if (!AUTO_RESTART_WHEN_ZOMBIE_DETECTED) return

  // The limit only applies when ZOMBIE_MAX_RESTARTS is greater than 0
  if (
    ZOMBIE_MAX_RESTARTS > 0 &&
    zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
  ) {
    return
  }

  logger('info', 'restarting...')

  try {
    await restartAppAsync(pm_id)
  } catch (restartErr) {
    logger(
      'error',
      `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
      restartErr,
    )

    setZombieRestartFailedHistory(pm_id)

    return
  }

  const restarts = (zombieRestartHistory.get(pm_id) ?? 0) + 1

  zombieRestartHistory.set(pm_id, restarts)

  // Clear the samples right after a successful restart so the fresh process is
  // not immediately judged on the old one's history
  zombieCpuHistory.delete(pm_id)

  logger(
    'info',
    `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${restarts} times`,
  )

  // An alert failure must not be recorded as a restart failure
  try {
    await sendRestartAlert(
      `The zombie process has been restarted!`,
      `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${restarts} times`,
    )
  } catch (alertErr) {
    logger(
      'error',
      `[ZOMBIE] Alert failed for ${name} (pm_id: ${pm_id}):`,
      alertErr,
    )
  }
}

/**
 * Handles a process in sustained CPU overload: optionally collects perf
 * samples, then restarts it, records the outcome and sends an alert.
 *
 * @param {{ name: string, pid: number, pm_id: number }} app
 */
const restartOverloadedApp = async ({ name, pid, pm_id }) => {
  if (enablePerfCollection && !perfSamplingStats.get(pm_id)) {
    perfSamplingStats.set(pm_id, true)

    try {
      await performPerfSampling({
        pid,
        moduleName: MODULE_NAME,
        perfDir: perfReportGenerationDir,
        flamegraphDir,
        sampleDuration: perfSampleDuration,
        sampleFrequency: perfSampleFrequency,
      })
    } catch (samplingErr) {
      // Sampling is diagnostic only; a failure must not block the restart
      logger(
        'error',
        `[CPU OVERLOAD] perf sampling failed for ${name} (pm_id: ${pm_id}):`,
        samplingErr,
      )
    } finally {
      // Always release the marker, otherwise the process stays "in sampling"
      perfSamplingStats.delete(pm_id)
    }
  }

  logger('info', 'restarting...')

  try {
    await restartAppAsync(pm_id)
  } catch (restartErr) {
    logger(
      'error',
      `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
      restartErr,
    )

    setCpuOverloadRestartFailedHistory(pm_id)

    return
  }

  const restarts = (cpuOverloadRestartHistory.get(pm_id) ?? 0) + 1

  cpuOverloadRestartHistory.set(pm_id, restarts)
  cpuOverloadHistory.delete(pm_id)

  logger(
    'info',
    `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${restarts} times`,
  )

  // An alert failure must not be recorded as a restart failure
  try {
    await sendRestartAlert(
      `CPU overload process restarted!`,
      `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${restarts} times`,
    )
  } catch (alertErr) {
    logger(
      'error',
      `[CPU OVERLOAD] Alert failed for ${name} (pm_id: ${pm_id}):`,
      alertErr,
    )
  }
}

/**
 * Runs all checks for one PM2 app: samples its CPU usage, optionally triggers
 * Node inspector profiling, then handles zombie and CPU-overload detection.
 *
 * @param {object} app entry of the list returned by `listAppsAsync`
 */
const checkAppHealth = async (app) => {
  const { name, pid, pm_id, monit, pm2_env } = app

  // Skip this module itself and any app filtered out by include/exclude lists
  if (
    MODULE_NAME === name ||
    (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
    (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
  ) {
    return
  }

  // Only online processes are checked; drop the samples of any other process
  // so stale data cannot influence later decisions
  if (pm2_env?.status !== 'online') {
    zombieCpuHistory.delete(pm_id)
    cpuOverloadHistory.delete(pm_id)

    return
  }

  // Prefer the system reading; fall back to PM2's value when it is not a number
  const pm2CpuUsage = monit?.cpu
  const sysCpuUsage = await getSysCpuUsageByPid(pid)
  const appCpuUsage =
    typeof sysCpuUsage === 'number' ? sysCpuUsage : pm2CpuUsage

  cpuUsageFromPM2.set(pm_id, pm2CpuUsage)
  cpuUsageFromSys.set(pm_id, sysCpuUsage)

  const zombieSamples = setZombieCpuHistory(pm_id, appCpuUsage)
  const overloadSamples = setCpuOverloadHistory(pm_id, appCpuUsage)

  // Ask the app process to record a CPU profile while it is above the threshold
  if (enableNodeInspectorCollection && appCpuUsage >= cpuOverloadThreshold) {
    await sendMessage(pm_id, 'cpu-profile-start')
    await sleepAsync(nodeInspectorSampleDuration * 1000)
    await sendMessage(pm_id, 'cpu-profile-stop')
  }

  // Run the zombie check only when the feature is enabled
  if (ZOMBIE_DETECTION && (await isZombieProcess(pid, zombieSamples))) {
    logger(
      'info',
      `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
    )

    await restartZombieApp(app)

    return
  }

  // Sustained CPU overload
  if (cpuOverloadDetection && isCpuOverload(overloadSamples)) {
    logger(
      'info',
      `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
    )

    await restartOverloadedApp(app)
  }
}

/**
 * Checks every PM2 app once. Runs are not allowed to overlap: a call made
 * while a previous run is still in progress returns immediately.
 */
const processChecker = async () => {
  if (isProcessCheckerRunning) return

  try {
    isProcessCheckerRunning = true

    const apps = await listAppsAsync()

    cpuUsageFromPM2.clear()
    cpuUsageFromSys.clear()

    // `every` is vacuously true for an empty list, so require at least one app
    const hasAbnormalMonitoringData =
      apps.length > 0 && apps.every((v) => typeof v.monit?.cpu !== 'number')

    if (hasAbnormalMonitoringData) {
      const details = apps.map(({ name, pid, pm_id, monit }) => ({
        name,
        pid,
        pm_id,
        monit,
      }))

      logger('warn', 'Abnormal monitoring data exists: ', details)
    }

    for (const app of apps) {
      // Isolate failures so one app cannot stop the checks for the others
      try {
        await checkAppHealth(app)
      } catch (appErr) {
        logger(
          'error',
          `Check failed for ${app.name} (pm_id: ${app.pm_id}):`,
          appErr,
        )
      }
    }
  } catch (err) {
    logger('error', err)
  } finally {
    isProcessCheckerRunning = false
  }
}
443
+
444
/**
 * Formats per-process counters as "[pm_id]:count" pairs joined by ";".
 * Entries that are not greater than 0 are left out.
 *
 * @param { Map<number, number> } counts
 * @returns { string } the formatted pairs, or 'N/A' when there is none
 */
const formatRestartCounts = (counts) => {
  const parts = []

  for (const [pm_id, count] of counts) {
    if (count > 0) {
      parts.push(`[${pm_id}]:${count}`)
    }
  }

  return parts.length ? parts.join(';') : 'N/A'
}

/**
 * Formats per-process CPU readings as "[pm_id]:value%" pairs joined by ";".
 * A reading that is not a number is shown as "N/A" instead of "undefined%".
 *
 * @param { Map<number, number | undefined> } usages
 * @returns { string } the formatted pairs, or 'N/A' when there is none
 */
const formatCpuUsage = (usages) => {
  const parts = []

  for (const [pm_id, usage] of usages) {
    const hasReading = typeof usage === 'number' && !Number.isNaN(usage)

    parts.push(hasReading ? `[${pm_id}]:${usage}%` : `[${pm_id}]:N/A`)
  }

  return parts.length ? parts.join(';') : 'N/A'
}

/**
 * Starts the module when it is enabled: connects to the local PM2 daemon, runs
 * the process checker immediately and then every WORKER_INTERVAL, and
 * registers the PMX metrics.
 */
const runModule = () => {
  if (!MODULE_ENABLED) return

  // connect to local pm2
  pm2.connect((err) => {
    if (err) {
      logger('error', `PM2 connection error:`, err)

      process.exit(1)
    }

    logger('info', 'Connected to PM2, starting monitor...')

    processChecker()

    setInterval(() => {
      processChecker()
    }, WORKER_INTERVAL)
  })

  /** PMX probes **/
  Probe.metric({
    name: 'Zombie Restarts',
    value: () => formatRestartCounts(zombieRestartHistory),
  })

  Probe.metric({
    name: 'Zombie Restarts (failed)',
    value: () => formatRestartCounts(zombieRestartFailedHistory),
  })

  Probe.metric({
    name: 'CPU Overload Restarts',
    value: () => formatRestartCounts(cpuOverloadRestartHistory),
  })

  Probe.metric({
    name: 'CPU Overload Restarts (failed)',
    value: () => formatRestartCounts(cpuOverloadRestartFailedHistory),
  })

  Probe.metric({
    name: 'Processes in Sampling (perf)',
    value: () => {
      const sampling = []

      for (const [pm_id, active] of perfSamplingStats) {
        if (active === true) {
          sampling.push(pm_id)
        }
      }

      return sampling.length ? sampling.join(', ') : 'N/A'
    },
  })

  Probe.metric({
    name: 'CPU (pm2)',
    value: () => formatCpuUsage(cpuUsageFromPM2),
  })

  Probe.metric({
    name: 'CPU (sys)',
    value: () => formatCpuUsage(cpuUsageFromSys),
  })
}
580
+
581
// Entry point: start the module when this file is loaded
runModule()