pm2-perfmonitor 2.4.2 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +59 -55
- package/lib/alert.js +29 -29
- package/lib/app.js +516 -499
- package/lib/defaults.js +123 -104
- package/lib/execa-helper.js +17 -17
- package/lib/job-conf.js +39 -39
- package/lib/message.js +35 -35
- package/lib/perf-sampler.js +241 -239
- package/lib/pm2-extra.js +54 -54
- package/lib/utils.js +62 -62
- package/lib/zombie-check.js +65 -0
- package/package.json +11 -2
package/lib/app.js
CHANGED
|
@@ -1,499 +1,516 @@
|
|
|
1
|
-
const pmx = require('pmx')
|
|
2
|
-
const pm2 = require('pm2')
|
|
3
|
-
|
|
4
|
-
const {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
const {
|
|
13
|
-
const {
|
|
14
|
-
const {
|
|
15
|
-
const {
|
|
16
|
-
|
|
17
|
-
const
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
const
|
|
35
|
-
const
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
)
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
const
|
|
43
|
-
const
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const
|
|
47
|
-
const
|
|
48
|
-
const
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
)
|
|
53
|
-
const
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
const
|
|
57
|
-
const
|
|
58
|
-
const
|
|
59
|
-
const
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
const
|
|
68
|
-
const
|
|
69
|
-
const
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
//
|
|
294
|
-
|
|
295
|
-
logger(
|
|
296
|
-
'info',
|
|
297
|
-
`
|
|
298
|
-
)
|
|
299
|
-
|
|
300
|
-
if (
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
Probe.metric({
|
|
431
|
-
name: '
|
|
432
|
-
value: () => {
|
|
433
|
-
const res = []
|
|
434
|
-
|
|
435
|
-
for (const [k, v] of
|
|
436
|
-
if (v > 0) {
|
|
437
|
-
res.push([k, v])
|
|
438
|
-
}
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
if (!res.length) return 'N/A'
|
|
442
|
-
|
|
443
|
-
return res.map((v) => `[${v[0]}]:${v[1]}`).join('
|
|
444
|
-
},
|
|
445
|
-
})
|
|
446
|
-
|
|
447
|
-
Probe.metric({
|
|
448
|
-
name: '
|
|
449
|
-
value: () => {
|
|
450
|
-
const res = []
|
|
451
|
-
|
|
452
|
-
for (const [
|
|
453
|
-
if (
|
|
454
|
-
res.push(
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
if (!res.length) return 'N/A'
|
|
459
|
-
|
|
460
|
-
return res.join('
|
|
461
|
-
},
|
|
462
|
-
})
|
|
463
|
-
|
|
464
|
-
Probe.metric({
|
|
465
|
-
name: 'CPU Overload Restarts
|
|
466
|
-
value: () => {
|
|
467
|
-
const res = []
|
|
468
|
-
|
|
469
|
-
for (const [k, v] of
|
|
470
|
-
if (v > 0) {
|
|
471
|
-
res.push([k, v])
|
|
472
|
-
}
|
|
473
|
-
}
|
|
474
|
-
|
|
475
|
-
if (!res.length) return 'N/A'
|
|
476
|
-
|
|
477
|
-
return res.map((v) => `[${v[0]}]:${v[1]}`).join('
|
|
478
|
-
},
|
|
479
|
-
})
|
|
480
|
-
|
|
481
|
-
Probe.metric({
|
|
482
|
-
name: '
|
|
483
|
-
value: () => {
|
|
484
|
-
const res = []
|
|
485
|
-
|
|
486
|
-
for (const [k, v] of
|
|
487
|
-
if (v
|
|
488
|
-
res.push(k)
|
|
489
|
-
}
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
if (!res.length) return 'N/A'
|
|
493
|
-
|
|
494
|
-
return res.join('
|
|
495
|
-
},
|
|
496
|
-
})
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
1
|
+
const pmx = require('pmx')
|
|
2
|
+
const pm2 = require('pm2')
|
|
3
|
+
|
|
4
|
+
const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
|
|
5
|
+
const {
|
|
6
|
+
parseParamToArray,
|
|
7
|
+
parseParamToNumber,
|
|
8
|
+
parseBool,
|
|
9
|
+
sleepAsync,
|
|
10
|
+
getSysCpuUsageByPid,
|
|
11
|
+
} = require('./utils')
|
|
12
|
+
const { defaultOptions } = require('./defaults')
|
|
13
|
+
const { sendMessage } = require('./message')
|
|
14
|
+
const { performPerfSampling } = require('./perf-sampler')
|
|
15
|
+
const { sendAlert } = require('./alert')
|
|
16
|
+
const { getJobConfHostName } = require('./job-conf')
|
|
17
|
+
const { isZombieCpuProcess, isZombieStateProcess } = require('./zombie-check')
|
|
18
|
+
|
|
19
|
+
/**
 * Effective module configuration: the built-in defaults overlaid with the
 * options handed to the init callback (incoming values win on conflict).
 *
 * NOTE(review): this relies on pmx.initModule returning the value returned by
 * the callback — confirm against the installed pmx version.
 *
 * @type { defaultOptions }
 */
const conf = pmx.initModule({}, (err, incomingConf) => {
  if (err) {
    // incomingConf may be missing when initialisation failed; guard the lookup
    // so the real init error is reported instead of a secondary TypeError.
    const moduleName = incomingConf?.module_name ?? 'pm2-perfmonitor'

    console.error(`[${moduleName}] init module error:`, err)
    process.exit(2)
  }

  return {
    ...defaultOptions,
    ...incomingConf,
  }
})
|
|
33
|
+
|
|
34
|
+
// PMX probe handle used to publish this module's metrics.
const Probe = pmx.probe()

// Options that are used exactly as configured (no parsing applied).
const {
  module_name: MODULE_NAME,
  zombieProcessDetectionStrategy,
  perfReportGenerationDir,
  flamegraphDir,
  alertCmdPath,
  alertEnv,
  alertLevel,
  jobHostNameConfPath,
} = conf

// general conf
const MODULE_ENABLED = parseBool(conf.enabled)
const WORKER_INTERVAL = parseParamToNumber(conf.workerInterval)
const INCLUDE_APPS = parseParamToArray(conf.includeApps)
const EXCLUDE_APPS = parseParamToArray(conf.excludeApps)

// zombie conf
const ZOMBIE_DETECTION = parseBool(conf.zombieDetection)
const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
  conf.autoRestartWhenZombieDetected,
)
const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)

// cpu conf
const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
const enablePerfCollection = parseBool(conf.enablePerfCollection)
const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
const enableNodeInspectorCollection = parseBool(
  conf.enableNodeInspectorCollection,
)
const nodeInspectorSampleDuration = parseParamToNumber(
  conf.nodeInspectorSampleDuration,
)

// alert conf
const enableAlert = parseBool(conf.enableAlert)

// Per-process CPU sample history used by zombie detection
// (pm_id -> [cpu1, cpu2, ...]).
const zombieCpuHistory = new Map()
// pm_id -> number of successful restarts triggered by zombie detection.
const zombieRestartHistory = new Map()
// pm_id -> number of failed restart attempts triggered by zombie detection.
const zombieRestartFailedHistory = new Map()

// Per-process CPU sample history used by overload detection
// (pm_id -> [cpu1, cpu2, ...]).
const cpuOverloadHistory = new Map()
// pm_id -> number of successful restarts triggered by CPU overload.
const cpuOverloadRestartHistory = new Map()
// pm_id -> number of failed restart attempts triggered by CPU overload.
const cpuOverloadRestartFailedHistory = new Map()

// True while a processChecker pass is in flight; used as a re-entrancy guard.
let isProcessCheckerRunning = false

/**
 * Whether perf sampling is currently in progress for a process (keyed by pm_id).
 * @type { Map<number,boolean> }
 */
const perfSamplingStats = new Map()
|
|
89
|
+
|
|
90
|
+
/**
 * Write a message through the chosen console method, prefixed with the
 * module name in square brackets.
 *
 * @param {'log' | 'info' | 'error' | 'warn'} type console method to use
 * @param {...any} args values forwarded to the console method after the prefix
 */
const logger = (type, ...args) => {
  const prefix = `[${MODULE_NAME}]`

  return console[type](prefix, ...args)
}
|
|
97
|
+
|
|
98
|
+
/**
 * Decide whether a process should be treated as a zombie, according to the
 * configured `zombieProcessDetectionStrategy`:
 *
 * - 'zombie-state': only the system process-state check is used.
 * - 'zero-cpu-consecutive' (also the fallback for any other value): only the
 *   CPU-history check is used (per the original note: the most recent
 *   ZOMBIE_MAX_HITS samples are all 0%).
 * - 'zombie-state-and-zero-cpu': both checks must agree; if the state check
 *   reports `failed`, the CPU-history result alone is returned.
 * - 'zombie-state-or-zero-cpu': either check is sufficient.
 *
 * @param { number } pid system process id passed to the state check
 * @param { number[] } cpus recent CPU usage samples of the process
 * @returns { Promise<boolean> } result of the selected check(s)
 */
const isZombieProcess = async (pid, cpus) => {
  // CPU-history check shared by every strategy that looks at CPU samples.
  const checkZeroCpu = () =>
    isZombieCpuProcess({
      cpus,
      maxHits: ZOMBIE_MAX_HITS,
    })

  switch (zombieProcessDetectionStrategy) {
    case 'zombie-state': {
      const stateResult = await isZombieStateProcess(pid)

      return stateResult.isZombie
    }

    case 'zombie-state-and-zero-cpu': {
      const { isZombie: stateSaysZombie, failed } =
        await isZombieStateProcess(pid)
      const cpuSaysZombie = checkZeroCpu()

      // When the system process state could not be read, fall back to the
      // consecutive-zero-CPU verdict alone.
      if (failed) {
        return cpuSaysZombie
      }

      return stateSaysZombie && cpuSaysZombie
    }

    case 'zombie-state-or-zero-cpu': {
      const { isZombie: stateSaysZombie } = await isZombieStateProcess(pid)
      const cpuSaysZombie = checkZeroCpu()

      return stateSaysZombie || cpuSaysZombie
    }

    case 'zero-cpu-consecutive':
    default:
      return checkZeroCpu()
  }
}
|
|
143
|
+
|
|
144
|
+
/**
 * Report whether a CPU history indicates sustained overload: it must hold at
 * least `cpuOverloadMaxHits` samples and every sample must be at or above
 * `cpuOverloadThreshold`.
 *
 * @param { number[] } cpus recent CPU usage samples of one process
 * @returns { boolean } true when all samples meet the overload threshold
 */
const isCpuOverload = (cpus) => {
  const hasEnoughSamples = cpus.length >= cpuOverloadMaxHits

  if (!hasEnoughSamples) {
    return false
  }

  const meetsThreshold = (usage) => usage >= cpuOverloadThreshold

  return cpus.every(meetsThreshold)
}
|
|
153
|
+
|
|
154
|
+
/**
 * Append a CPU usage sample to the zombie-detection history of a process,
 * creating the history on first use and dropping the oldest sample once more
 * than ZOMBIE_MAX_HITS samples are stored.
 *
 * @param { number } pm_id PM2 process id used as the history key
 * @param { number } appCpuUsage CPU usage sample to record
 * @returns { number[] } the CPU usage history stored for this pm_id
 */
const setZombieCpuHistory = (pm_id, appCpuUsage) => {
  let samples = zombieCpuHistory.get(pm_id)

  if (samples === undefined) {
    samples = []
    zombieCpuHistory.set(pm_id, samples)
  }

  samples.push(appCpuUsage)

  // Keep only the most recent ZOMBIE_MAX_HITS samples.
  const isOverCapacity = samples.length > ZOMBIE_MAX_HITS

  if (isOverCapacity) {
    samples.shift()
  }

  return samples
}
|
|
175
|
+
|
|
176
|
+
/**
 * Append a CPU usage sample to the overload-detection history of a process,
 * creating the history on first use and dropping the oldest sample once more
 * than `cpuOverloadMaxHits` samples are stored.
 *
 * @param { number } pm_id PM2 process id used as the history key
 * @param { number } appCpuUsage CPU usage sample to record
 * @returns { number[] } the CPU usage history stored for this pm_id
 */
const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
  let samples = cpuOverloadHistory.get(pm_id)

  if (samples === undefined) {
    samples = []
    cpuOverloadHistory.set(pm_id, samples)
  }

  samples.push(appCpuUsage)

  // Keep only the most recent cpuOverloadMaxHits samples.
  const isOverCapacity = samples.length > cpuOverloadMaxHits

  if (isOverCapacity) {
    samples.shift()
  }

  return samples
}
|
|
197
|
+
|
|
198
|
+
/**
 * Increment the count of failed zombie-triggered restarts for a process,
 * starting at 1 the first time a failure is recorded.
 *
 * @param { number } pm_id PM2 process id whose failure counter is bumped
 */
const setZombieRestartFailedHistory = (pm_id) => {
  const failures = zombieRestartFailedHistory.has(pm_id)
    ? zombieRestartFailedHistory.get(pm_id) + 1
    : 1

  zombieRestartFailedHistory.set(pm_id, failures)
}
|
|
208
|
+
|
|
209
|
+
/**
 * Increment the count of failed CPU-overload-triggered restarts for a
 * process, starting at 1 the first time a failure is recorded.
 *
 * @param { number } pm_id PM2 process id whose failure counter is bumped
 */
const setCpuOverloadRestartFailedHistory = (pm_id) => {
  const failures = cpuOverloadRestartFailedHistory.has(pm_id)
    ? cpuOverloadRestartFailedHistory.get(pm_id) + 1
    : 1

  cpuOverloadRestartFailedHistory.set(pm_id, failures)
}
|
|
219
|
+
|
|
220
|
+
/**
 * Send a restart alert through the configured alert command. Does nothing
 * when alerting is disabled (`enableAlert` is false).
 *
 * The alert title is prefixed with the module name; the content is prefixed
 * with the job host name and the local date/time.
 *
 * @param {string} title short alert headline
 * @param {string} message alert details
 * @returns {Promise<any>} the result of sendAlert, or undefined when disabled
 */
const sendRestartAlert = async (title, message) => {
  if (enableAlert) {
    const datetime = new Date().toLocaleString()
    const jobHostName = getJobConfHostName(jobHostNameConfPath)

    const alertPayload = {
      cmd: alertCmdPath,
      env: alertEnv,
      level: alertLevel,
      title: `[${MODULE_NAME}] Alert: ${title}`,
      content: `[${jobHostName}] [${datetime}] - ${message}`,
    }

    const result = await sendAlert(alertPayload)

    return result
  }
}
|
|
239
|
+
|
|
240
|
+
/**
 * Run one monitoring pass over every app returned by PM2.
 *
 * For each target app that is online, the pass:
 *  1. records the current CPU usage (system reading when it is a number,
 *     otherwise PM2's `monit.cpu`) in both CPU histories;
 *  2. optionally asks the app to capture a CPU profile while its usage is at
 *     or above the overload threshold;
 *  3. restarts the app when it is detected as a zombie (subject to
 *     ZOMBIE_MAX_RESTARTS), or otherwise when its CPU is persistently
 *     overloaded (optionally collecting a perf sample first).
 *
 * Passes never overlap: a call made while another pass is running returns
 * immediately. Errors are logged, never thrown to the caller.
 *
 * @returns {Promise<void>}
 */
const processChecker = async () => {
  if (isProcessCheckerRunning) return

  try {
    isProcessCheckerRunning = true

    const apps = await listAppsAsync()

    for (const app of apps) {
      const { name, pid, pm_id, monit, pm2_env } = app

      const appStatus = pm2_env?.status

      // Skip non-target apps: this module itself, apps missing from a
      // non-empty include list, and apps present in the exclude list.
      if (
        MODULE_NAME === name ||
        (INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
        (EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
      ) {
        continue
      }

      // Only online processes are checked.
      if (appStatus !== 'online') {
        // Drop the histories of a process that is not online so stale samples
        // do not interfere once it comes back.
        zombieCpuHistory.delete(pm_id)
        cpuOverloadHistory.delete(pm_id)

        continue
      }

      const pm2CpuUsage = monit?.cpu
      const sysCpuUsage = await getSysCpuUsageByPid(pid)
      const appCpuUsage =
        typeof sysCpuUsage === 'number' ? sysCpuUsage : pm2CpuUsage

      const cpuHistory = setZombieCpuHistory(pm_id, appCpuUsage)
      const cpuHistory2 = setCpuOverloadHistory(pm_id, appCpuUsage)

      // Message the app process so it samples its own CPU profile.
      if (enableNodeInspectorCollection) {
        if (appCpuUsage >= cpuOverloadThreshold) {
          await sendMessage(pm_id, 'cpu-profile-start')
          // presumably nodeInspectorSampleDuration is in seconds and
          // sleepAsync takes milliseconds — TODO confirm
          await sleepAsync(nodeInspectorSampleDuration * 1000)
          await sendMessage(pm_id, 'cpu-profile-stop')
        }
      }

      // Only run the zombie check when zombie detection is enabled; the
      // result is never used otherwise.
      const zombieDetected =
        ZOMBIE_DETECTION && (await isZombieProcess(pid, cpuHistory))

      if (zombieDetected) {
        logger(
          'info',
          `Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${pid})`,
        )

        if (AUTO_RESTART_WHEN_ZOMBIE_DETECTED) {
          // A positive ZOMBIE_MAX_RESTARTS caps zombie-triggered restarts.
          if (
            ZOMBIE_MAX_RESTARTS > 0 &&
            zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
          ) {
            continue
          }

          logger('info', 'restarting...')

          try {
            await restartAppAsync(pm_id)

            if (!zombieRestartHistory.has(pm_id)) {
              zombieRestartHistory.set(pm_id, 1)
            } else {
              const history = zombieRestartHistory.get(pm_id)

              zombieRestartHistory.set(pm_id, history + 1)
            }

            logger(
              'info',
              `[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${zombieRestartHistory.get(pm_id)} times`,
            )

            await sendRestartAlert(
              `The zombie process has been restarted!`,
              `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${zombieRestartHistory.get(pm_id)} times`,
            )

            // Clear the history after a restart so the fresh process is not
            // immediately judged a zombie again.
            zombieCpuHistory.delete(pm_id)
          } catch (restartErr) {
            logger(
              'error',
              `[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
              restartErr,
            )

            setZombieRestartFailedHistory(pm_id)
          }
        }
      }
      // Otherwise: is the CPU persistently overloaded?
      else if (cpuOverloadDetection && isCpuOverload(cpuHistory2)) {
        logger(
          'info',
          `CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${pid})`,
        )

        if (enablePerfCollection) {
          perfSamplingStats.set(pm_id, true)

          try {
            await performPerfSampling({
              pid,
              moduleName: MODULE_NAME,
              perfDir: perfReportGenerationDir,
              flamegraphDir,
              sampleDuration: perfSampleDuration,
              sampleFrequency: perfSampleFrequency,
            })
          } catch (samplingErr) {
            // A failed diagnostic capture must not abort the pass for the
            // remaining apps nor prevent the overloaded process's restart.
            logger(
              'error',
              `[CPU OVERLOAD] perf sampling failed for ${name} (pm_id: ${pm_id}):`,
              samplingErr,
            )
          } finally {
            // Always clear the "sampling in progress" flag, even on failure,
            // so the metric does not report the process as sampling forever.
            perfSamplingStats.delete(pm_id)
          }
        }

        try {
          logger('info', 'restarting...')

          await restartAppAsync(pm_id)

          if (!cpuOverloadRestartHistory.has(pm_id)) {
            cpuOverloadRestartHistory.set(pm_id, 1)
          } else {
            cpuOverloadRestartHistory.set(
              pm_id,
              cpuOverloadRestartHistory.get(pm_id) + 1,
            )
          }

          logger(
            'info',
            `[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${cpuOverloadRestartHistory.get(pm_id)} times`,
          )

          await sendRestartAlert(
            `CPU overload process restarted!`,
            `appName: ${name}, pid: ${pid}, pm_id: ${pm_id}, restarted: ${cpuOverloadRestartHistory.get(pm_id)} times`,
          )

          // Start the overload history afresh for the restarted process.
          cpuOverloadHistory.delete(pm_id)
        } catch (restartErr) {
          logger(
            'error',
            `[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
            restartErr,
          )

          setCpuOverloadRestartFailedHistory(pm_id)
        }
      }
    }
  } catch (err) {
    logger('error', err)
  } finally {
    // Release the guard on every exit path so the next pass can run.
    isProcessCheckerRunning = false
  }
}
|
|
408
|
+
|
|
409
|
+
/**
 * Module entry point. When the module is enabled it connects to the local
 * PM2 daemon, runs a first monitoring pass followed by one every
 * WORKER_INTERVAL, and registers the PMX metrics that expose restart counters
 * and the processes currently being perf-sampled. Exits the process with
 * code 1 if the PM2 connection fails.
 */
const runModule = () => {
  if (!MODULE_ENABLED) return

  // Invoked once the connection attempt to the local PM2 daemon completes.
  const onPm2Connected = (err) => {
    if (err) {
      logger('error', `PM2 connection error:`, err)

      process.exit(1)
    }

    logger('info', 'Connected to PM2, starting monitor...')

    processChecker()

    setInterval(processChecker, WORKER_INTERVAL)
  }

  // connect to local pm2
  pm2.connect(onPm2Connected)

  /** PROB PMX **/

  // Render a pm_id -> count map as "[pm_id]:count;..." (positive counts only),
  // or 'N/A' when there is nothing to report.
  const describeCounters = (counters) => {
    const parts = []

    for (const [pmId, count] of counters) {
      if (count > 0) {
        parts.push(`[${pmId}]:${count}`)
      }
    }

    return parts.length ? parts.join(';') : 'N/A'
  }

  const counterMetrics = [
    ['Zombie Restarts', zombieRestartHistory],
    ['Zombie Restarts (failed)', zombieRestartFailedHistory],
    ['CPU Overload Restarts', cpuOverloadRestartHistory],
    ['CPU Overload Restarts (failed)', cpuOverloadRestartFailedHistory],
  ]

  for (const [name, counters] of counterMetrics) {
    Probe.metric({
      name,
      value: () => describeCounters(counters),
    })
  }

  Probe.metric({
    name: 'Processes in Sampling (perf)',
    value: () => {
      // pm_ids whose perf sampling flag is currently set to true.
      const samplingIds = [...perfSamplingStats]
        .filter(([, isSampling]) => isSampling === true)
        .map(([pmId]) => pmId)

      return samplingIds.length ? samplingIds.join(', ') : 'N/A'
    },
  })
}

runModule()
|