pm2-perfmonitor 2.6.1 → 2.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/app.js CHANGED
@@ -1,581 +1,581 @@
1
- const pmx = require('pmx')
2
- const pm2 = require('pm2')
3
-
4
- const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
5
- const {
6
- parseParamToArray,
7
- parseParamToNumber,
8
- parseBool,
9
- safeToFixed,
10
- sleepAsync,
11
- getSysCpuUsageByPid,
12
- } = require('./utils')
13
- const { defaultOptions } = require('./defaults')
14
- const { sendMessage } = require('./message')
15
- const { performPerfSampling } = require('./perf-sampler')
16
- const { sendAlert } = require('./alert')
17
- const { getJobConfHostName } = require('./job-conf')
18
- const { isZombieCpuProcess, isZombieStateProcess } = require('./zombie-check')
19
-
20
- /**
21
- * @type { defaultOptions }
22
- */
23
- const conf = pmx.initModule({}, (err, incomingConf) => {
24
- if (err) {
25
- console.error(`[${incomingConf.module_name}] init module error:`, err)
26
- process.exit(2)
27
- }
28
-
29
- return {
30
- ...defaultOptions,
31
- ...incomingConf,
32
- }
33
- })
34
-
35
- const Probe = pmx.probe()
36
- const MODULE_NAME = conf.module_name
37
- const MODULE_ENABLED = parseBool(conf.enabled)
38
- const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
39
- const INCLUDE_APPS = parseParamToArray(conf.includeApps)
40
- const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)
41
-
42
- // zombie conf
43
- const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
44
- const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
45
- conf.autoRestartWhenZombieDetected,
46
- )
47
- const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
48
- const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)
49
- const zombieProcessDetectionStrategy = conf.zombieProcessDetectionStrategy
50
-
51
- // cpu conf
52
- const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
53
- const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
54
- const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
55
- const enablePerfCollection = parseBool(conf.enablePerfCollection)
56
- const perfReportGenerationDir = conf.perfReportGenerationDir
57
- const flamegraphDir = conf.flamegraphDir
58
- const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
59
- const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
60
- const enableNodeInspectorCollection = parseBool(
61
- conf.enableNodeInspectorCollection,
62
- )
63
- const nodeInspectorSampleDuration = parseParamToNumber(
64
- conf.nodeInspectorSampleDuration,
65
- )
66
-
67
- // alert conf
68
- const enableAlert = parseBool(conf.enableAlert)
69
- const alertCmdPath = conf.alertCmdPath
70
- const alertEnv = conf.alertEnv
71
- const alertLevel = conf.alertLevel
72
- const jobHostNameConfPath = conf.jobHostNameConfPath
73
-
74
- // 存储每个进程的 CPU 采样历史(pm_id -> [cpu1, cpu2, ...])
75
- const zombieCpuHistory = new Map()
76
- const zombieRestartHistory = new Map()
77
- const zombieRestartFailedHistory = new Map()
78
-
79
- const cpuOverloadHistory = new Map()
80
- const cpuOverloadRestartHistory = new Map()
81
- const cpuOverloadRestartFailedHistory = new Map()
82
-
83
- /**
84
- * @type { Map<number, number> }
85
- */
86
- const cpuUsageFromPM2 = new Map()
87
- /**
88
- * @type { Map<number, number> }
89
- */
90
- const cpuUsageFromSys = new Map()
91
-
92
- let isProcessCheckerRunning = false
93
-
94
- /**
95
- * perf 样本是否采集中
96
- * @type { Map<number,boolean> }
97
- */
98
- const perfSamplingStats = new Map()
99
-
100
- /**
101
- * @param {'log' | 'info' | 'error' | 'warn'} type
102
- *
103
- */
104
- const logger = (type, ...args) => {
105
- return console[type](`[${MODULE_NAME}]`, ...args)
106
- }
107
-
108
- /**
109
- * 判断是否为僵尸进程:最近 ZOMBIE_MAX_HITS 次全是 0%
110
- * @param { number } pid
111
- * @param { number[] } cpus
112
- */
113
- const isZombieProcess = async (pid, cpus) => {
114
- if (zombieProcessDetectionStrategy === 'zombie-state') {
115
- const res = await isZombieStateProcess(pid)
116
-
117
- return res.isZombie
118
- } else if (zombieProcessDetectionStrategy === 'zero-cpu-consecutive') {
119
- return isZombieCpuProcess({
120
- cpus,
121
- maxHits: ZOMBIE_MAX_HITS,
122
- })
123
- } else if (zombieProcessDetectionStrategy === 'zombie-state-and-zero-cpu') {
124
- const { isZombie: zombieState, failed } = await isZombieStateProcess(pid)
125
-
126
- const zombieCpu = isZombieCpuProcess({
127
- cpus,
128
- maxHits: ZOMBIE_MAX_HITS,
129
- })
130
-
131
- // 获取系统进程状态失败时,回退为通过 CPU 持续 0% 判定
132
- if (failed) {
133
- return zombieCpu
134
- }
135
-
136
- return zombieState && zombieCpu
137
- } else if (zombieProcessDetectionStrategy === 'zombie-state-or-zero-cpu') {
138
- const { isZombie: zombieState } = await isZombieStateProcess(pid)
139
-
140
- const zombieCpu = isZombieCpuProcess({
141
- cpus,
142
- maxHits: ZOMBIE_MAX_HITS,
143
- })
144
-
145
- return zombieState || zombieCpu
146
- }
147
-
148
- return isZombieCpuProcess({
149
- cpus,
150
- maxHits: ZOMBIE_MAX_HITS,
151
- })
152
- }
153
-
154
- /**
155
- * @param { number[] } cpus
156
- */
157
- const isCpuOverload = (cpus) => {
158
- return (
159
- cpus.length >= cpuOverloadMaxHits &&
160
- cpus.every((v) => v >= cpuOverloadThreshold)
161
- )
162
- }
163
-
164
- /**
165
- * @param { number } pm_id
166
- * @param { number } appCpuUsage
167
- * @returns { number[] } 对应 pm_id 的 CPU 使用率数组
168
- */
169
- const setZombieCpuHistory = (pm_id, appCpuUsage) => {
170
- if (!zombieCpuHistory.has(pm_id)) {
171
- zombieCpuHistory.set(pm_id, [])
172
- }
173
-
174
- const history = zombieCpuHistory.get(pm_id)
175
-
176
- history.push(appCpuUsage)
177
-
178
- // 只保留最近 ZOMBIE_MAX_HITS 次记录
179
- if (history.length > ZOMBIE_MAX_HITS) {
180
- history.shift()
181
- }
182
-
183
- return history
184
- }
185
-
186
- /**
187
- * @param { number } pm_id
188
- * @param { number } appCpuUsage
189
- * @returns { number[] } 对应 pm_id 的 CPU 使用率数组
190
- */
191
- const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
192
- if (!cpuOverloadHistory.has(pm_id)) {
193
- cpuOverloadHistory.set(pm_id, [])
194
- }
195
-
196
- const history = cpuOverloadHistory.get(pm_id)
197
-
198
- history.push(appCpuUsage)
199
-
200
- // 只保留最近 x 次记录
201
- if (history.length > cpuOverloadMaxHits) {
202
- history.shift()
203
- }
204
-
205
- return history
206
- }
207
-
208
- const setZombieRestartFailedHistory = (pm_id) => {
209
- if (!zombieRestartFailedHistory.has(pm_id)) {
210
- zombieRestartFailedHistory.set(pm_id, 1)
211
- } else {
212
- zombieRestartFailedHistory.set(
213
- pm_id,
214
- zombieRestartFailedHistory.get(pm_id) + 1,
215
- )
216
- }
217
- }
218
-
219
- const setCpuOverloadRestartFailedHistory = (pm_id) => {
220
- if (!cpuOverloadRestartFailedHistory.has(pm_id)) {
221
- cpuOverloadRestartFailedHistory.set(pm_id, 1)
222
- } else {
223
- cpuOverloadRestartFailedHistory.set(
224
- pm_id,
225
- cpuOverloadRestartFailedHistory.get(pm_id) + 1,
226
- )
227
- }
228
- }
229
-
230
- /**
231
- * 发送重启警告
232
- * @param {string} title
233
- * @param {string} message
234
- */
235
- const sendRestartAlert = async (title, message) => {
236
- if (!enableAlert) return
237
-
238
- const datetime = new Date().toLocaleString()
239
- const jobHostName = getJobConfHostName(jobHostNameConfPath)
240
-
241
- return await sendAlert({
242
- cmd: alertCmdPath,
243
- env: alertEnv,
244
- level: alertLevel,
245
- title: `[${MODULE_NAME}] Alert: ${title}`,
246
- content: `[${jobHostName}] [${datetime}] - ${message}`,
247
- })
248
- }
249
-
250
- /**
251
- * check process
252
- */
253
- const processChecker = async () => {
254
- if (isProcessCheckerRunning) return
255
-
256
- try {
257
- isProcessCheckerRunning = true
258
-
259
- const apps = await listAppsAsync()
260
-
261
- cpuUsageFromPM2.clear()
262
- cpuUsageFromSys.clear()
263
-
264
- const cpus = apps.map((v) => v.monit?.cpu)
265
-
266
- const hasAbnormalMonitoringData = cpus.every((v) => typeof v !== 'number')
267
-
268
- if (hasAbnormalMonitoringData) {
269
- const details = apps.map((v) => {
270
- const { name, pid, pm_id, monit } = v
271
-
272
- return {
273
- name,
274
- pid,
275
- pm_id,
276
- monit,
277
- }
278
- })
279
-
280
- logger('warn', 'Abnormal monitoring data exists: ', details)
281
- }
282
-
283
- for (const app of apps) {
284
- const { name, pid, pm_id, monit, pm2_env } = app
285
-
286
- const appStatus = pm2_env?.status
287
-
288
- // 非目标应用,跳过
289
- if (
290
- MODULE_NAME === name ||
291
- (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
292
- (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
293
- ) {
294
- continue
295
- }
296
-
297
- // 只处理 online 状态的进程
298
- if (appStatus !== 'online') {
299
- // 进程不在 online 状态时,清空其历史记录,避免干扰
300
- zombieCpuHistory.delete(pm_id)
301
- cpuOverloadHistory.delete(pm_id)
302
-
303
- continue
304
- }
305
-
306
- const pm2CpuUsage = monit?.cpu
307
- const sysCpuUsage = await getSysCpuUsageByPid(pid)
308
- const appCpuUsage =
309
- typeof sysCpuUsage === 'number' ? sysCpuUsage : pm2CpuUsage
310
-
311
- cpuUsageFromPM2.set(pm_id, pm2CpuUsage)
312
- cpuUsageFromSys.set(pm_id, sysCpuUsage)
313
-
314
- const cpuHistory = setZombieCpuHistory(pm_id, appCpuUsage)
315
- const cpuHistory2 = setCpuOverloadHistory(pm_id, appCpuUsage)
316
-
317
- // 发送消息通知对应应用进程,采样 CPU 性能
318
- if (enableNodeInspectorCollection) {
319
- if (appCpuUsage >= cpuOverloadThreshold) {
320
- await sendMessage(pm_id, 'cpu-profile-start')
321
- await sleepAsync(nodeInspectorSampleDuration * 1000)
322
- await sendMessage(pm_id, 'cpu-profile-stop')
323
- }
324
- }
325
-
326
- const zombieDetected = await isZombieProcess(pid, cpuHistory)
327
-
328
- // 判断是否为僵尸:最近 ZOMBIE_MAX_HITS 次全是 0%
329
- if (ZOMBIE_DETECTION && zombieDetected) {
330
- logger(
331
- 'info',
332
- `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
333
- )
334
-
335
- if (AUTO_RESTART_WHEN_ZOMBIE_DETECTED) {
336
- if (
337
- ZOMBIE_MAX_RESTARTS > 0 &&
338
- zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
339
- ) {
340
- continue
341
- }
342
-
343
- logger('info', 'restarting...')
344
-
345
- try {
346
- await restartAppAsync(pm_id)
347
-
348
- if (!zombieRestartHistory.has(pm_id)) {
349
- zombieRestartHistory.set(pm_id, 1)
350
- } else {
351
- const history = zombieRestartHistory.get(pm_id)
352
-
353
- zombieRestartHistory.set(pm_id, history + 1)
354
- }
355
-
356
- logger(
357
- 'info',
358
- `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${zombieRestartHistory.get(pm_id)} times`,
359
- )
360
-
361
- await sendRestartAlert(
362
- `The zombie process has been restarted!`,
363
- `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${zombieRestartHistory.get(pm_id)} times`,
364
- )
365
-
366
- // 重启后清除该进程的历史记录,避免刚重启又被判定为僵尸
367
- zombieCpuHistory.delete(pm_id)
368
- } catch (restartErr) {
369
- logger(
370
- 'error',
371
- `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
372
- restartErr,
373
- )
374
-
375
- setZombieRestartFailedHistory(pm_id)
376
- }
377
- }
378
- }
379
- // CPU 是否持续过载
380
- else if (cpuOverloadDetection && isCpuOverload(cpuHistory2)) {
381
- logger(
382
- 'info',
383
- `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
384
- )
385
-
386
- if (enablePerfCollection && !perfSamplingStats.get(pm_id)) {
387
- perfSamplingStats.set(pm_id, true)
388
-
389
- await performPerfSampling({
390
- pid,
391
- moduleName: MODULE_NAME,
392
- perfDir: perfReportGenerationDir,
393
- flamegraphDir,
394
- sampleDuration: perfSampleDuration,
395
- sampleFrequency: perfSampleFrequency,
396
- })
397
-
398
- perfSamplingStats.delete(pm_id)
399
- }
400
-
401
- try {
402
- logger('info', 'restarting...')
403
-
404
- await restartAppAsync(pm_id)
405
-
406
- if (!cpuOverloadRestartHistory.has(pm_id)) {
407
- cpuOverloadRestartHistory.set(pm_id, 1)
408
- } else {
409
- cpuOverloadRestartHistory.set(
410
- pm_id,
411
- cpuOverloadRestartHistory.get(pm_id) + 1,
412
- )
413
- }
414
-
415
- logger(
416
- 'info',
417
- `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${cpuOverloadRestartHistory.get(pm_id)} times`,
418
- )
419
-
420
- await sendRestartAlert(
421
- `CPU overload process restarted!`,
422
- `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${cpuOverloadRestartHistory.get(pm_id)} times`,
423
- )
424
-
425
- cpuOverloadHistory.delete(pm_id)
426
- } catch (restartErr) {
427
- logger(
428
- 'error',
429
- `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
430
- restartErr,
431
- )
432
-
433
- setCpuOverloadRestartFailedHistory(pm_id)
434
- }
435
- }
436
- }
437
- } catch (err) {
438
- logger('error', err)
439
- } finally {
440
- isProcessCheckerRunning = false
441
- }
442
- }
443
-
444
- const runModule = () => {
445
- if (!MODULE_ENABLED) return
446
-
447
- // connect to local pm2
448
- pm2.connect((err) => {
449
- if (err) {
450
- logger('error', `PM2 connection error:`, err)
451
-
452
- process.exit(1)
453
- }
454
-
455
- logger('info', 'Connected to PM2, starting monitor...')
456
-
457
- processChecker()
458
-
459
- setInterval(() => {
460
- processChecker()
461
- }, WORKER_INTERVAL)
462
- })
463
-
464
- /** PROB PMX **/
465
- Probe.metric({
466
- name: 'Zombie Restarts',
467
- value: () => {
468
- const res = []
469
-
470
- for (const [k, v] of zombieRestartHistory) {
471
- if (v > 0) {
472
- res.push([k, v])
473
- }
474
- }
475
-
476
- if (!res.length) return 'N/A'
477
-
478
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
479
- },
480
- })
481
-
482
- Probe.metric({
483
- name: 'Zombie Restarts (failed)',
484
- value: () => {
485
- const res = []
486
-
487
- for (const [k, v] of zombieRestartFailedHistory) {
488
- if (v > 0) {
489
- res.push([k, v])
490
- }
491
- }
492
-
493
- if (!res.length) return 'N/A'
494
-
495
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
496
- },
497
- })
498
-
499
- Probe.metric({
500
- name: 'CPU Overload Restarts',
501
- value: () => {
502
- const res = []
503
-
504
- for (const [k, v] of cpuOverloadRestartHistory) {
505
- if (v > 0) {
506
- res.push([k, v])
507
- }
508
- }
509
-
510
- if (!res.length) return 'N/A'
511
-
512
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
513
- },
514
- })
515
-
516
- Probe.metric({
517
- name: 'CPU Overload Restarts (failed)',
518
- value: () => {
519
- const res = []
520
-
521
- for (const [k, v] of cpuOverloadRestartFailedHistory) {
522
- if (v > 0) {
523
- res.push([k, v])
524
- }
525
- }
526
-
527
- if (!res.length) return 'N/A'
528
-
529
- return res.map((v) => `[${v[0]}]:${v[1]}`).join(';')
530
- },
531
- })
532
-
533
- Probe.metric({
534
- name: 'Processes in Sampling (perf)',
535
- value: () => {
536
- const res = []
537
-
538
- for (const [k, v] of perfSamplingStats) {
539
- if (v === true) {
540
- res.push(k)
541
- }
542
- }
543
-
544
- if (!res.length) return 'N/A'
545
-
546
- return res.join(', ')
547
- },
548
- })
549
-
550
- Probe.metric({
551
- name: 'CPU (pm2)',
552
- value: () => {
553
- const res = []
554
-
555
- for (const [k, v] of cpuUsageFromPM2) {
556
- res.push([k, v])
557
- }
558
-
559
- if (!res.length) return 'N/A'
560
-
561
- return res.map((v) => `[${v[0]}]:${v[1]}%`).join(';')
562
- },
563
- })
564
-
565
- Probe.metric({
566
- name: 'CPU (sys)',
567
- value: () => {
568
- const res = []
569
-
570
- for (const [k, v] of cpuUsageFromSys) {
571
- res.push([k, v])
572
- }
573
-
574
- if (!res.length) return 'N/A'
575
-
576
- return res.map((v) => `[${v[0]}]:${v[1]}%`).join(';')
577
- },
578
- })
579
- }
580
-
581
- runModule()
1
+ const pmx = require('pmx')
2
+ const pm2 = require('pm2')
3
+
4
+ const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
5
+ const {
6
+ parseParamToArray,
7
+ parseParamToNumber,
8
+ parseBool,
9
+ safeToFixed,
10
+ sleepAsync,
11
+ getSysCpuUsageByPid,
12
+ } = require('./utils')
13
+ const { defaultOptions } = require('./defaults')
14
+ const { sendMessage } = require('./message')
15
+ const { performPerfSampling } = require('./perf-sampler')
16
+ const { sendAlert } = require('./alert')
17
+ const { getJobConfHostName } = require('./job-conf')
18
+ const { isZombieCpuProcess, isZombieStateProcess } = require('./zombie-check')
19
+
20
/**
 * Module configuration, assigned from the return value of `pmx.initModule`.
 *
 * The init callback returns `defaultOptions` overlaid with the configuration
 * pmx hands to it, so explicitly configured values win over the defaults.
 * NOTE(review): this relies on `pmx.initModule` returning the callback's
 * return value — confirm against the pmx version in use.
 *
 * On an initialisation error the callback logs the error and exits the
 * process with code 2.
 *
 * @type { defaultOptions }
 */
const conf = pmx.initModule({}, (err, incomingConf) => {
  if (err) {
    // incomingConf may be absent when initialisation failed; read the module
    // name defensively so the error handler itself cannot throw a TypeError
    // and mask the real error.
    console.error(`[${incomingConf?.module_name}] init module error:`, err)
    process.exit(2)
  }

  return {
    ...defaultOptions,
    ...incomingConf,
  }
})
34
+
35
// Probe handle used by runModule to register the exported metrics.
const Probe = pmx.probe()
// Name of this module; used as the log prefix and to skip monitoring itself.
const MODULE_NAME = conf.module_name
// When false, runModule returns without connecting to PM2.
const MODULE_ENABLED = parseBool(conf.enabled)
// Delay between two processChecker runs; passed directly to setInterval (ms).
const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
// App-name filters: a non-empty include list restricts monitoring to those
// names, a non-empty exclude list removes the listed names.
const INCLUDE_APPS = parseParamToArray(conf.includeApps)
const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)

// Zombie detection settings
const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
  conf.autoRestartWhenZombieDetected,
)
// Number of most recent CPU samples kept per process for zombie detection.
const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
// Upper bound on zombie restarts per pm_id; only enforced when greater than 0.
const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)
// One of 'zombie-state', 'zero-cpu-consecutive', 'zombie-state-and-zero-cpu',
// 'zombie-state-or-zero-cpu'; any other value falls back to the CPU-only check
// (see isZombieProcess).
const zombieProcessDetectionStrategy = conf.zombieProcessDetectionStrategy

// CPU overload settings
const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
// A sample counts as overloaded when it is >= this threshold.
const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
// Number of most recent CPU samples kept per process for overload detection.
const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
const enablePerfCollection = parseBool(conf.enablePerfCollection)
const perfReportGenerationDir = conf.perfReportGenerationDir
const flamegraphDir = conf.flamegraphDir
const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
const enableNodeInspectorCollection = parseBool(
  conf.enableNodeInspectorCollection,
)
// Multiplied by 1000 before being passed to sleepAsync — presumably seconds;
// confirm against sleepAsync's unit.
const nodeInspectorSampleDuration = parseParamToNumber(
  conf.nodeInspectorSampleDuration,
)

// Alert settings
const enableAlert = parseBool(conf.enableAlert)
const alertCmdPath = conf.alertCmdPath
const alertEnv = conf.alertEnv
const alertLevel = conf.alertLevel
const jobHostNameConfPath = conf.jobHostNameConfPath

// Per-process CPU sample history (pm_id -> [cpu1, cpu2, ...])
const zombieCpuHistory = new Map()
// pm_id -> number of successful zombie restarts
const zombieRestartHistory = new Map()
// pm_id -> number of failed zombie restart attempts
const zombieRestartFailedHistory = new Map()

// Same three structures for CPU overload handling
const cpuOverloadHistory = new Map()
const cpuOverloadRestartHistory = new Map()
const cpuOverloadRestartFailedHistory = new Map()

/**
 * Latest CPU usage per pm_id as reported by PM2 (`monit.cpu`); cleared at the
 * start of every check cycle.
 * @type { Map<number, number> }
 */
const cpuUsageFromPM2 = new Map()
/**
 * Latest CPU usage per pm_id as returned by getSysCpuUsageByPid; cleared at
 * the start of every check cycle.
 * @type { Map<number, number> }
 */
const cpuUsageFromSys = new Map()

// Re-entrancy guard: true while a processChecker run is in progress.
let isProcessCheckerRunning = false

/**
 * Whether a perf sample is currently being collected for a pm_id
 * @type { Map<number,boolean> }
 */
const perfSamplingStats = new Map()
99
+
100
/**
 * Forward a message to the chosen console method, prefixed with the module
 * name in square brackets.
 *
 * @param {'log' | 'info' | 'error' | 'warn'} type console method to invoke
 * @param {...any} args values appended after the prefix
 * @returns whatever the console method returns
 */
const logger = (type, ...args) => {
  const prefixed = [`[${MODULE_NAME}]`, ...args]

  return console[type](...prefixed)
}
107
+
108
/**
 * Decide whether a process should be treated as a zombie, according to
 * `zombieProcessDetectionStrategy`.
 *
 * - 'zombie-state': result of the system process-state check only
 * - 'zero-cpu-consecutive' (and any unrecognised value): CPU-history check only
 * - 'zombie-state-and-zero-cpu': both checks must agree; when the state check
 *   reports `failed`, the CPU-history check alone decides
 * - 'zombie-state-or-zero-cpu': either check is sufficient
 *
 * @param { number } pid system process id handed to the state check
 * @param { number[] } cpus recent CPU samples handed to the CPU-history check
 * @returns { Promise<boolean> } outcome of the selected check(s)
 */
const isZombieProcess = async (pid, cpus) => {
  // Wrapped in a closure so every strategy still runs the CPU-history check
  // at the same point in its sequence (after the state check, where both run).
  const checkCpuHistory = () =>
    isZombieCpuProcess({
      cpus,
      maxHits: ZOMBIE_MAX_HITS,
    })

  switch (zombieProcessDetectionStrategy) {
    case 'zombie-state': {
      const stateResult = await isZombieStateProcess(pid)

      return stateResult.isZombie
    }

    case 'zombie-state-and-zero-cpu': {
      const stateResult = await isZombieStateProcess(pid)
      const cpuSaysZombie = checkCpuHistory()

      // Fall back to the CPU-history verdict when the process state could not
      // be read from the system.
      return stateResult.failed
        ? cpuSaysZombie
        : stateResult.isZombie && cpuSaysZombie
    }

    case 'zombie-state-or-zero-cpu': {
      const stateResult = await isZombieStateProcess(pid)
      const cpuSaysZombie = checkCpuHistory()

      return stateResult.isZombie || cpuSaysZombie
    }

    case 'zero-cpu-consecutive':
    default:
      return checkCpuHistory()
  }
}
153
+
154
/**
 * Report sustained CPU overload: at least `cpuOverloadMaxHits` samples are
 * present and every one of them is >= `cpuOverloadThreshold`.
 *
 * @param { number[] } cpus recent CPU samples for one process
 * @returns { boolean }
 */
const isCpuOverload = (cpus) => {
  const hasEnoughSamples = cpus.length >= cpuOverloadMaxHits

  if (!hasEnoughSamples) return false

  return cpus.every((sample) => sample >= cpuOverloadThreshold)
}
163
+
164
/**
 * Append a CPU sample to the zombie-detection history of a process, keeping
 * at most `ZOMBIE_MAX_HITS` of the most recent samples.
 *
 * @param { number } pm_id
 * @param { number } appCpuUsage
 * @returns { number[] } the (live) sample array stored for this pm_id
 */
const setZombieCpuHistory = (pm_id, appCpuUsage) => {
  let samples = zombieCpuHistory.get(pm_id)

  if (!zombieCpuHistory.has(pm_id)) {
    samples = []
    zombieCpuHistory.set(pm_id, samples)
  }

  // push returns the new length; drop the oldest sample once the window
  // would exceed ZOMBIE_MAX_HITS entries
  const sampleCount = samples.push(appCpuUsage)

  if (sampleCount > ZOMBIE_MAX_HITS) {
    samples.shift()
  }

  return samples
}
185
+
186
/**
 * Append a CPU sample to the overload-detection history of a process, keeping
 * at most `cpuOverloadMaxHits` of the most recent samples.
 *
 * @param { number } pm_id
 * @param { number } appCpuUsage
 * @returns { number[] } the (live) sample array stored for this pm_id
 */
const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
  let samples = cpuOverloadHistory.get(pm_id)

  if (!cpuOverloadHistory.has(pm_id)) {
    samples = []
    cpuOverloadHistory.set(pm_id, samples)
  }

  // push returns the new length; drop the oldest sample once the window
  // would exceed cpuOverloadMaxHits entries
  const sampleCount = samples.push(appCpuUsage)

  if (sampleCount > cpuOverloadMaxHits) {
    samples.shift()
  }

  return samples
}
207
+
208
/**
 * Count one more failed zombie-restart attempt for a process.
 * The counter starts at 1 the first time a pm_id is seen.
 *
 * @param { number } pm_id
 */
const setZombieRestartFailedHistory = (pm_id) => {
  const failures = zombieRestartFailedHistory.has(pm_id)
    ? zombieRestartFailedHistory.get(pm_id) + 1
    : 1

  zombieRestartFailedHistory.set(pm_id, failures)
}
218
+
219
/**
 * Count one more failed CPU-overload restart attempt for a process.
 * The counter starts at 1 the first time a pm_id is seen.
 *
 * @param { number } pm_id
 */
const setCpuOverloadRestartFailedHistory = (pm_id) => {
  const failures = cpuOverloadRestartFailedHistory.has(pm_id)
    ? cpuOverloadRestartFailedHistory.get(pm_id) + 1
    : 1

  cpuOverloadRestartFailedHistory.set(pm_id, failures)
}
229
+
230
/**
 * Send a restart alert through the configured alert command.
 * Does nothing (resolves to undefined) when alerting is disabled.
 *
 * The title is prefixed with the module name; the content is prefixed with
 * the job host name and the local date/time.
 *
 * @param {string} title
 * @param {string} message
 * @returns the result of `sendAlert`, or undefined when alerts are disabled
 */
const sendRestartAlert = async (title, message) => {
  if (enableAlert) {
    const datetime = new Date().toLocaleString()
    const jobHostName = getJobConfHostName(jobHostNameConfPath)

    const alertPayload = {
      cmd: alertCmdPath,
      env: alertEnv,
      level: alertLevel,
      title: `[${MODULE_NAME}] Alert: ${title}`,
      content: `[${jobHostName}] [${datetime}] - ${message}`,
    }

    return await sendAlert(alertPayload)
  }
}
249
+
250
/**
 * Increment the numeric counter stored under `key` (starting at 1).
 *
 * @param { Map<number, number> } counters
 * @param { number } key
 * @returns { number } the new counter value
 */
const incrementCounter = (counters, key) => {
  const next = counters.has(key) ? counters.get(key) + 1 : 1

  counters.set(key, next)

  return next
}

/**
 * Send a restart alert, logging (not propagating) any failure so that a
 * broken alert channel is never mistaken for a failed restart.
 *
 * @param { string } title
 * @param { string } message
 */
const notifyRestart = async (title, message) => {
  try {
    await sendRestartAlert(title, message)
  } catch (alertErr) {
    logger('error', `Failed to send restart alert (${title}):`, alertErr)
  }
}

/**
 * Restart a process that was detected as a zombie, unless the per-process
 * restart cap (`ZOMBIE_MAX_RESTARTS`, enforced only when > 0) is reached.
 *
 * @param {{ name: string, pid: number, pm_id: number }} app
 */
const restartZombieApp = async ({ name, pid, pm_id }) => {
  if (
    ZOMBIE_MAX_RESTARTS > 0 &&
    zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
  ) {
    return
  }

  logger('info', 'restarting...')

  try {
    await restartAppAsync(pm_id)
  } catch (restartErr) {
    logger(
      'error',
      `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
      restartErr,
    )

    setZombieRestartFailedHistory(pm_id)

    return
  }

  const restartCount = incrementCounter(zombieRestartHistory, pm_id)

  // Clear the samples right after the restart so the fresh process is not
  // immediately judged on the old process's history.
  zombieCpuHistory.delete(pm_id)

  logger(
    'info',
    `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${restartCount} times`,
  )

  await notifyRestart(
    `The zombie process has been restarted!`,
    `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${restartCount} times`,
  )
}

/**
 * Collect a perf sample for an overloaded process when perf collection is
 * enabled and no sampling is already recorded as running for that pm_id.
 * A sampling failure is logged and does not prevent the subsequent restart.
 *
 * @param { number } pid
 * @param { number } pm_id
 */
const collectPerfSample = async (pid, pm_id) => {
  if (!enablePerfCollection || perfSamplingStats.get(pm_id)) return

  perfSamplingStats.set(pm_id, true)

  try {
    await performPerfSampling({
      pid,
      moduleName: MODULE_NAME,
      perfDir: perfReportGenerationDir,
      flamegraphDir,
      sampleDuration: perfSampleDuration,
      sampleFrequency: perfSampleFrequency,
    })
  } catch (perfErr) {
    logger(
      'error',
      `perf sampling failed (pm_id: ${pm_id}, pid: ${pid}):`,
      perfErr,
    )
  } finally {
    // Always release the flag; otherwise one failed sampling would disable
    // perf collection for this pm_id for the lifetime of the module.
    perfSamplingStats.delete(pm_id)
  }
}

/**
 * Restart a process whose CPU usage has been continuously over the threshold.
 *
 * @param {{ name: string, pid: number, pm_id: number }} app
 */
const restartOverloadedApp = async ({ name, pid, pm_id }) => {
  logger('info', 'restarting...')

  try {
    await restartAppAsync(pm_id)
  } catch (restartErr) {
    logger(
      'error',
      `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
      restartErr,
    )

    setCpuOverloadRestartFailedHistory(pm_id)

    return
  }

  const restartCount = incrementCounter(cpuOverloadRestartHistory, pm_id)

  cpuOverloadHistory.delete(pm_id)

  logger(
    'info',
    `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${restartCount} times`,
  )

  await notifyRestart(
    `CPU overload process restarted!`,
    `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${restartCount} times`,
  )
}

/**
 * Run one monitoring cycle over all PM2 apps.
 *
 * For every monitored, online app: record its CPU usage (system value when
 * available, otherwise PM2's), optionally trigger a Node inspector CPU
 * profile, then handle zombie detection or, failing that, sustained CPU
 * overload. Overlapping runs are prevented by `isProcessCheckerRunning`;
 * any error is logged and ends the current cycle.
 */
const processChecker = async () => {
  if (isProcessCheckerRunning) return

  try {
    isProcessCheckerRunning = true

    const apps = await listAppsAsync()

    cpuUsageFromPM2.clear()
    cpuUsageFromSys.clear()

    // Warn when no app at all reports a numeric CPU value from PM2.
    const hasAbnormalMonitoringData = apps.every(
      (v) => typeof v.monit?.cpu !== 'number',
    )

    if (hasAbnormalMonitoringData) {
      const details = apps.map(({ name, pid, pm_id, monit }) => ({
        name,
        pid,
        pm_id,
        monit,
      }))

      logger('warn', 'Abnormal monitoring data exists: ', details)
    }

    for (const app of apps) {
      const { name, pid, pm_id, monit, pm2_env } = app

      // Skip apps that are not monitoring targets (this module itself, apps
      // outside the include list, apps on the exclude list).
      if (
        MODULE_NAME === name ||
        (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
        (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
      ) {
        continue
      }

      // Only online processes are checked; drop the history of anything else
      // so stale samples cannot influence a later verdict.
      if (pm2_env?.status !== 'online') {
        zombieCpuHistory.delete(pm_id)
        cpuOverloadHistory.delete(pm_id)

        continue
      }

      const pm2CpuUsage = monit?.cpu
      const sysCpuUsage = await getSysCpuUsageByPid(pid)
      const appCpuUsage =
        typeof sysCpuUsage === 'number' ? sysCpuUsage : pm2CpuUsage

      cpuUsageFromPM2.set(pm_id, pm2CpuUsage)
      cpuUsageFromSys.set(pm_id, sysCpuUsage)

      const zombieSamples = setZombieCpuHistory(pm_id, appCpuUsage)
      const overloadSamples = setCpuOverloadHistory(pm_id, appCpuUsage)

      // Ask the app process to record a CPU profile while it is over the
      // overload threshold.
      if (
        enableNodeInspectorCollection &&
        appCpuUsage >= cpuOverloadThreshold
      ) {
        await sendMessage(pm_id, 'cpu-profile-start')
        await sleepAsync(nodeInspectorSampleDuration * 1000)
        await sendMessage(pm_id, 'cpu-profile-stop')
      }

      // Only probe for zombies when the feature is enabled, so a disabled
      // feature costs nothing and cannot break the cycle.
      const zombieDetected =
        ZOMBIE_DETECTION && (await isZombieProcess(pid, zombieSamples))

      if (zombieDetected) {
        logger(
          'info',
          `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${pid})`,
        )

        if (AUTO_RESTART_WHEN_ZOMBIE_DETECTED) {
          await restartZombieApp({ name, pid, pm_id })
        }
      }
      // Otherwise check for sustained CPU overload
      else if (cpuOverloadDetection && isCpuOverload(overloadSamples)) {
        logger(
          'info',
          `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${pid})`,
        )

        await collectPerfSample(pid, pm_id)
        await restartOverloadedApp({ name, pid, pm_id })
      }
    }
  } catch (err) {
    logger('error', err)
  } finally {
    isProcessCheckerRunning = false
  }
}
443
+
444
/**
 * Start the module: connect to the local PM2 daemon, run the process checker
 * once and then every `WORKER_INTERVAL`, and register the pmx metrics.
 * Returns immediately when the module is disabled; exits the process with
 * code 1 when the PM2 connection fails.
 */
const runModule = () => {
  if (!MODULE_ENABLED) return

  // Render "[pm_id]:count" pairs for every positive counter, or 'N/A'.
  const describeCounters = (counters) => {
    const parts = []

    counters.forEach((count, id) => {
      if (count > 0) {
        parts.push(`[${id}]:${count}`)
      }
    })

    return parts.length ? parts.join(';') : 'N/A'
  }

  // Render "[pm_id]:cpu%" pairs for every recorded process, or 'N/A'.
  const describeCpuUsage = (usage) => {
    if (usage.size === 0) return 'N/A'

    return [...usage].map(([id, cpu]) => `[${id}]:${cpu}%`).join(';')
  }

  // List the pm_ids whose perf sampling flag is exactly true, or 'N/A'.
  const describePerfSampling = () => {
    const sampling = [...perfSamplingStats]
      .filter(([, active]) => active === true)
      .map(([id]) => id)

    return sampling.length ? sampling.join(', ') : 'N/A'
  }

  // connect to local pm2
  pm2.connect((err) => {
    if (err) {
      logger('error', `PM2 connection error:`, err)

      process.exit(1)
    }

    logger('info', 'Connected to PM2, starting monitor...')

    processChecker()

    setInterval(processChecker, WORKER_INTERVAL)
  })

  /** PROB PMX **/
  const metrics = [
    ['Zombie Restarts', () => describeCounters(zombieRestartHistory)],
    [
      'Zombie Restarts (failed)',
      () => describeCounters(zombieRestartFailedHistory),
    ],
    ['CPU Overload Restarts', () => describeCounters(cpuOverloadRestartHistory)],
    [
      'CPU Overload Restarts (failed)',
      () => describeCounters(cpuOverloadRestartFailedHistory),
    ],
    ['Processes in Sampling (perf)', describePerfSampling],
    ['CPU (pm2)', () => describeCpuUsage(cpuUsageFromPM2)],
    ['CPU (sys)', () => describeCpuUsage(cpuUsageFromSys)],
  ]

  for (const [name, value] of metrics) {
    Probe.metric({ name, value })
  }
}
580
+
581
+ runModule()