pm2-perfmonitor 1.2.3 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -11
- package/lib/app.js +240 -50
- package/lib/defaults.js +50 -0
- package/lib/message.js +25 -12
- package/lib/perf-sampler.js +247 -0
- package/lib/pm2-extra.js +17 -0
- package/lib/utils.js +10 -0
- package/package.json +4 -2
package/README.md
CHANGED
|
@@ -5,28 +5,44 @@ A pm2 module for performance monitor.
|
|
|
5
5
|
# Features
|
|
6
6
|
|
|
7
7
|
- Automatically detect **zombie** processes and restart them.
|
|
8
|
-
- Monitor the number of zombie process restarts (pm2 monit).
|
|
8
|
+
- Monitor the number of zombie process restarts (`pm2 monit`).
|
|
9
|
+
- [Added in v2] Support **CPU overload** protection (automatic restart + `perf` collection).
|
|
10
|
+
- [Added in v2] Monitor the number of CPU Overload process restarts (`pm2 monit`).
|
|
9
11
|
|
|
10
12
|
# Installation
|
|
11
13
|
|
|
12
14
|
```bash
|
|
15
|
+
# install or update
|
|
13
16
|
$ pm2 install pm2-perfmonitor
|
|
17
|
+
|
|
18
|
+
# uninstall
|
|
19
|
+
$ pm2 uninstall pm2-perfmonitor
|
|
14
20
|
```
|
|
15
21
|
|
|
16
22
|
> NOTE: the command is `pm2 install` NOT `npm install`
|
|
17
23
|
|
|
18
24
|
# Configure
|
|
19
25
|
|
|
20
|
-
| Property |
|
|
21
|
-
| :-----------------------------: |
|
|
22
|
-
| `enabled` |
|
|
23
|
-
| `excludeApps` |
|
|
24
|
-
| `includeApps` |
|
|
25
|
-
| `workerInterval` |
|
|
26
|
-
| `zombieDetection` |
|
|
27
|
-
| `zombieMaxHits` |
|
|
28
|
-
| `autoRestartWhenZombieDetected` |
|
|
29
|
-
| `zombieMaxRestarts` |
|
|
26
|
+
| Property | Defaults | Description | Supported |
|
|
27
|
+
| :-----------------------------: | :-----------------: | :-----------------------------------------------------------------------------------------------------: | :----------: |
|
|
28
|
+
| `enabled` | `true` | Specify whether to enable this module | v1 and above |
|
|
29
|
+
| `excludeApps` | - | Specify the application name that needs to be excluded from guardianship | v1 and above |
|
|
30
|
+
| `includeApps` | - | Specify the application name that needs to be guarded | v1 and above |
|
|
31
|
+
| `workerInterval` | `60000` | Timed task execution interval (ms) | v1 and above |
|
|
32
|
+
| `zombieDetection` | `true` | Specify whether to enable zombie process protection | v1 and above |
|
|
33
|
+
| `zombieMaxHits` | `10` | Specify the maximum occurrence frequency of zombie status | v1 and above |
|
|
34
|
+
| `autoRestartWhenZombieDetected` | `true` | Specify whether to automatically restart zombie processes | v1 and above |
|
|
35
|
+
| `zombieMaxRestarts` | `0` | Specify the maximum number of restarts for zombie processes (set to `0` to indicate no limit) | v1 and above |
|
|
36
|
+
| `cpuOverloadDetection` | `false` | Specify whether to enable CPU overload protection | v2 |
|
|
37
|
+
| `cpuOverloadThreshold` | `90` | Specify the threshold for determining CPU overload | v2 |
|
|
38
|
+
| `cpuOverloadMaxHits` | `5` | Maximum number of consecutive occurrences of CPU overload allowed (automatically restarts when reached) | v2 |
|
|
39
|
+
| `enableNodeInspectorCollection` | `false` | Specify whether to enable `node:inspector` performance collection | v2 |
|
|
40
|
+
| `nodeInspectorSampleDuration` | `10` | Specify the performance collection duration (s) for `node:inspector` | v2 |
|
|
41
|
+
| `enablePerfCollection` | `false` | Specify whether to enable `perf` performance collection | v2 |
|
|
42
|
+
| `perfReportGenerationDir` | `/var/log/pm2/perf` | Specify the directory for generating performance reports for `perf` | v2 |
|
|
43
|
+
| `flamegraphDir` | `/opt/FlameGraph` | Specify the directory of the `FlameGraph` flame graph generation tool | v2 |
|
|
44
|
+
| `perfSampleDuration` | `10` | Specify the sampling duration (s) for `perf` | v2 |
|
|
45
|
+
| `perfSampleFrequency` | `99` | Specify the sampling frequency (Hz) for `perf` | v2 |
|
|
30
46
|
|
|
31
47
|
# How to set these values ?
|
|
32
48
|
|
|
@@ -36,3 +52,4 @@ After having installed the module you have to type : `pm2 set pm2-perfmonitor:<p
|
|
|
36
52
|
|
|
37
53
|
- `pm2 set pm2-perfmonitor:includeApps myNuxtApp1, myNextApp2` (Only detect applications named `myNuxtApp1` and `myNextApp2`)
|
|
38
54
|
- `pm2 set pm2-perfmonitor:workerInterval 120000` (Check every `2` minutes)
|
|
55
|
+
- `pm2 set pm2-perfmonitor:cpuOverloadDetection true` (Enable **CPU overload** protection)
|
package/lib/app.js
CHANGED
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
const pmx = require('pmx')
|
|
2
2
|
const pm2 = require('pm2')
|
|
3
|
-
const { listAppsAsync } = require('./pm2-extra')
|
|
4
|
-
const {
|
|
3
|
+
const { listAppsAsync, restartAppAsync } = require('./pm2-extra')
|
|
4
|
+
const {
|
|
5
|
+
parseParamToArray,
|
|
6
|
+
parseParamToNumber,
|
|
7
|
+
parseBool,
|
|
8
|
+
sleepAsync,
|
|
9
|
+
} = require('./utils')
|
|
5
10
|
const { defaultOptions } = require('./defaults')
|
|
6
|
-
|
|
11
|
+
const { sendMessage } = require('./message')
|
|
12
|
+
const { performPerfSampling } = require('./perf-sampler')
|
|
7
13
|
|
|
8
14
|
const conf = pmx.initModule({}, (err, incomingConf) => {
|
|
9
15
|
if (err) {
|
|
@@ -30,10 +36,29 @@ const AUTO_RESTART_WHEN_ZOMBIE_DETECTED = parseBool(
|
|
|
30
36
|
const ZOMBIE_MAX_HITS = parseParamToNumber(conf.zombieMaxHits)
|
|
31
37
|
const ZOMBIE_MAX_RESTARTS = parseParamToNumber(conf.zombieMaxRestarts)
|
|
32
38
|
|
|
39
|
+
const cpuOverloadDetection = parseBool(conf.cpuOverloadDetection)
|
|
40
|
+
const cpuOverloadThreshold = parseParamToNumber(conf.cpuOverloadThreshold)
|
|
41
|
+
const cpuOverloadMaxHits = parseParamToNumber(conf.cpuOverloadMaxHits)
|
|
42
|
+
const enablePerfCollection = parseBool(conf.enablePerfCollection)
|
|
43
|
+
const perfReportGenerationDir = conf.perfReportGenerationDir
|
|
44
|
+
const flamegraphDir = conf.flamegraphDir
|
|
45
|
+
const perfSampleDuration = parseParamToNumber(conf.perfSampleDuration)
|
|
46
|
+
const perfSampleFrequency = parseParamToNumber(conf.perfSampleFrequency)
|
|
47
|
+
const enableNodeInspectorCollection = parseBool(
|
|
48
|
+
conf.enableNodeInspectorCollection,
|
|
49
|
+
)
|
|
50
|
+
const nodeInspectorSampleDuration = parseParamToNumber(
|
|
51
|
+
conf.nodeInspectorSampleDuration,
|
|
52
|
+
)
|
|
53
|
+
|
|
33
54
|
// 存储每个进程的 CPU 采样历史(pm_id -> [cpu1, cpu2, ...])
|
|
34
|
-
const
|
|
55
|
+
const zombieCpuHistory = new Map()
|
|
35
56
|
const zombieRestartHistory = new Map()
|
|
36
|
-
const
|
|
57
|
+
const zombieRestartFailedHistory = new Map()
|
|
58
|
+
|
|
59
|
+
const cpuOverloadHistory = new Map()
|
|
60
|
+
const cpuOverloadRestartHistory = new Map()
|
|
61
|
+
const cpuOverloadRestartFailedHistory = new Map()
|
|
37
62
|
|
|
38
63
|
/**
|
|
39
64
|
* @param {'log' | 'info' | 'error' | 'warn'} type
|
|
@@ -52,16 +77,90 @@ const isZombie = (history) => {
|
|
|
52
77
|
}
|
|
53
78
|
|
|
54
79
|
/**
|
|
55
|
-
*
|
|
80
|
+
* @param { number[] } history
|
|
81
|
+
*/
|
|
82
|
+
const isCpuOverload = (history) => {
  // Reports whether a window of CPU samples is a sustained overload: the
  // window must hold at least `cpuOverloadMaxHits` samples and every sample
  // must be at or above `cpuOverloadThreshold`.
  //
  // An empty window never counts as overload. `[].every()` is vacuously true,
  // so without this guard a `cpuOverloadMaxHits` of 0 would flag every
  // process as overloaded on each check.
  if (history.length === 0) {
    return false
  }

  return (
    history.length >= cpuOverloadMaxHits &&
    history.every((sample) => sample >= cpuOverloadThreshold)
  )
}
|
|
88
|
+
|
|
89
|
+
/**
 * Append a CPU usage sample to the zombie-detection window of a process.
 *
 * @param { number } pm_id - pm2 process id used as the key in `zombieCpuHistory`
 * @param { number } appCpuUsage - CPU usage sample to record
 * @returns { number[] } the CPU usage array stored for `pm_id`
 */
const setZombieCpuHistory = (pm_id, appCpuUsage) => {
  let samples = zombieCpuHistory.get(pm_id)

  // First sample for this process: create and register its window.
  if (!zombieCpuHistory.has(pm_id)) {
    samples = []
    zombieCpuHistory.set(pm_id, samples)
  }

  samples.push(appCpuUsage)

  // Keep only the most recent ZOMBIE_MAX_HITS samples.
  if (samples.length > ZOMBIE_MAX_HITS) {
    samples.shift()
  }

  return samples
}
|
|
110
|
+
|
|
111
|
+
/**
 * Append a CPU usage sample to the overload-detection window of a process.
 *
 * @param { number } pm_id - pm2 process id used as the key in `cpuOverloadHistory`
 * @param { number } appCpuUsage - CPU usage sample to record
 * @returns { number[] } the CPU usage array stored for `pm_id`
 */
const setCpuOverloadHistory = (pm_id, appCpuUsage) => {
  let samples = cpuOverloadHistory.get(pm_id)

  // First sample for this process: create and register its window.
  if (!cpuOverloadHistory.has(pm_id)) {
    samples = []
    cpuOverloadHistory.set(pm_id, samples)
  }

  samples.push(appCpuUsage)

  // Keep only the most recent cpuOverloadMaxHits samples.
  if (samples.length > cpuOverloadMaxHits) {
    samples.shift()
  }

  return samples
}
|
|
59
132
|
|
|
133
|
+
/**
 * Count one more failed zombie restart for a process.
 *
 * @param { number } pm_id - pm2 process id used as the key in `zombieRestartFailedHistory`
 */
const setZombieRestartFailedHistory = (pm_id) => {
  // Start at 1 for a process seen for the first time, otherwise increment.
  const failures = zombieRestartFailedHistory.has(pm_id)
    ? zombieRestartFailedHistory.get(pm_id) + 1
    : 1

  zombieRestartFailedHistory.set(pm_id, failures)
}
|
|
143
|
+
|
|
144
|
+
/**
 * Count one more failed CPU-overload restart for a process.
 *
 * @param { number } pm_id - pm2 process id used as the key in `cpuOverloadRestartFailedHistory`
 */
const setCpuOverloadRestartFailedHistory = (pm_id) => {
  // Start at 1 for a process seen for the first time, otherwise increment.
  const failures = cpuOverloadRestartFailedHistory.has(pm_id)
    ? cpuOverloadRestartFailedHistory.get(pm_id) + 1
    : 1

  cpuOverloadRestartFailedHistory.set(pm_id, failures)
}
|
|
154
|
+
|
|
155
|
+
/**
|
|
156
|
+
* check process
|
|
157
|
+
*/
|
|
158
|
+
const processChecker = async () => {
|
|
60
159
|
try {
|
|
61
160
|
const apps = await listAppsAsync()
|
|
62
161
|
|
|
63
|
-
|
|
64
|
-
const { name, pm_id, monit, pm2_env } = app
|
|
162
|
+
for (const app of apps) {
|
|
163
|
+
const { name, pm_id, monit, pm2_env, pid } = app
|
|
65
164
|
|
|
66
165
|
const appStatus = pm2_env?.status
|
|
67
166
|
const appCpuUsage = monit?.cpu || 0
|
|
@@ -72,31 +171,32 @@ const zombieProcessChecker = async () => {
|
|
|
72
171
|
(INCLUDE_APPS.length > 0 && !INCLUDE_APPS.includes(name)) ||
|
|
73
172
|
(EXCLUDE_APPS.length > 0 && EXCLUDE_APPS.includes(name))
|
|
74
173
|
) {
|
|
75
|
-
|
|
174
|
+
continue
|
|
76
175
|
}
|
|
77
176
|
|
|
78
177
|
// 只处理 online 状态的进程
|
|
79
178
|
if (appStatus !== 'online') {
|
|
80
179
|
// 进程不在 online 状态时,清空其历史记录,避免干扰
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
}
|
|
180
|
+
zombieCpuHistory.delete(pm_id)
|
|
181
|
+
cpuOverloadHistory.delete(pm_id)
|
|
84
182
|
|
|
85
|
-
|
|
86
|
-
cpuHistory.set(pm_id, [])
|
|
183
|
+
continue
|
|
87
184
|
}
|
|
88
185
|
|
|
89
|
-
const history =
|
|
90
|
-
|
|
91
|
-
history.push(appCpuUsage)
|
|
186
|
+
const history = setZombieCpuHistory(pm_id, appCpuUsage)
|
|
187
|
+
const history2 = setCpuOverloadHistory(pm_id, appCpuUsage)
|
|
92
188
|
|
|
93
|
-
//
|
|
94
|
-
if (
|
|
95
|
-
|
|
189
|
+
// 发送消息通知对应应用进程,采样 CPU 性能
|
|
190
|
+
if (enableNodeInspectorCollection) {
|
|
191
|
+
if (appCpuUsage >= cpuOverloadThreshold) {
|
|
192
|
+
await sendMessage(pm_id, 'cpu-profile-start')
|
|
193
|
+
await sleepAsync(nodeInspectorSampleDuration * 1000)
|
|
194
|
+
await sendMessage(pm_id, 'cpu-profile-stop')
|
|
195
|
+
}
|
|
96
196
|
}
|
|
97
197
|
|
|
98
198
|
// 判断是否为僵尸:最近 ZOMBIE_MAX_HITS 次全是 0%
|
|
99
|
-
if (isZombie(history)) {
|
|
199
|
+
if (ZOMBIE_DETECTION && isZombie(history)) {
|
|
100
200
|
logger(
|
|
101
201
|
'info',
|
|
102
202
|
`Zombie detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
|
|
@@ -107,30 +207,13 @@ const zombieProcessChecker = async () => {
|
|
|
107
207
|
ZOMBIE_MAX_RESTARTS > 0 &&
|
|
108
208
|
zombieRestartHistory.get(pm_id) >= ZOMBIE_MAX_RESTARTS
|
|
109
209
|
) {
|
|
110
|
-
|
|
210
|
+
continue
|
|
111
211
|
}
|
|
112
212
|
|
|
113
213
|
logger('info', 'restarting...')
|
|
114
214
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
logger(
|
|
118
|
-
'error',
|
|
119
|
-
`Restart failed for ${name} (pm_id: ${pm_id}):`,
|
|
120
|
-
restartErr,
|
|
121
|
-
)
|
|
122
|
-
|
|
123
|
-
if (!restartFailedHistory.has(pm_id)) {
|
|
124
|
-
restartFailedHistory.set(pm_id, 1)
|
|
125
|
-
} else {
|
|
126
|
-
restartFailedHistory.set(
|
|
127
|
-
pm_id,
|
|
128
|
-
restartFailedHistory.get(pm_id) + 1,
|
|
129
|
-
)
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
return
|
|
133
|
-
}
|
|
215
|
+
try {
|
|
216
|
+
await restartAppAsync(pm_id)
|
|
134
217
|
|
|
135
218
|
if (!zombieRestartHistory.has(pm_id)) {
|
|
136
219
|
zombieRestartHistory.set(pm_id, 1)
|
|
@@ -142,15 +225,71 @@ const zombieProcessChecker = async () => {
|
|
|
142
225
|
|
|
143
226
|
logger(
|
|
144
227
|
'info',
|
|
145
|
-
`Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${zombieRestartHistory.get(pm_id)} times`,
|
|
228
|
+
`[ZOMBIE] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${zombieRestartHistory.get(pm_id)} times`,
|
|
146
229
|
)
|
|
147
230
|
|
|
148
231
|
// 重启后清除该进程的历史记录,避免刚重启又被判定为僵尸
|
|
149
|
-
|
|
232
|
+
zombieCpuHistory.delete(pm_id)
|
|
233
|
+
} catch (restartErr) {
|
|
234
|
+
logger(
|
|
235
|
+
'error',
|
|
236
|
+
`[ZOMBIE] Restart failed for ${name} (pm_id: ${pm_id}):`,
|
|
237
|
+
restartErr,
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
setZombieRestartFailedHistory(pm_id)
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
// CPU 是否持续过载
|
|
245
|
+
else if (cpuOverloadDetection && isCpuOverload(history2)) {
|
|
246
|
+
logger(
|
|
247
|
+
'info',
|
|
248
|
+
`CPU Overload detected: ${name} (pm_id: ${pm_id}, pid: ${app.pid})`,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
if (enablePerfCollection) {
|
|
252
|
+
await performPerfSampling({
|
|
253
|
+
pid,
|
|
254
|
+
moduleName: MODULE_NAME,
|
|
255
|
+
perfDir: perfReportGenerationDir,
|
|
256
|
+
flamegraphDir,
|
|
257
|
+
sampleDuration: perfSampleDuration,
|
|
258
|
+
sampleFrequency: perfSampleFrequency,
|
|
150
259
|
})
|
|
151
260
|
}
|
|
261
|
+
|
|
262
|
+
try {
|
|
263
|
+
logger('info', 'restarting...')
|
|
264
|
+
|
|
265
|
+
await restartAppAsync(pm_id)
|
|
266
|
+
|
|
267
|
+
if (!cpuOverloadRestartHistory.has(pm_id)) {
|
|
268
|
+
cpuOverloadRestartHistory.set(pm_id, 1)
|
|
269
|
+
} else {
|
|
270
|
+
cpuOverloadRestartHistory.set(
|
|
271
|
+
pm_id,
|
|
272
|
+
cpuOverloadRestartHistory.get(pm_id) + 1,
|
|
273
|
+
)
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
logger(
|
|
277
|
+
'info',
|
|
278
|
+
`[CPU OVERLOAD] Restarted ${name} (pm_id: ${pm_id}) successfully!!! Restarted ${cpuOverloadRestartHistory.get(pm_id)} times`,
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
cpuOverloadHistory.delete(pm_id)
|
|
282
|
+
} catch (restartErr) {
|
|
283
|
+
logger(
|
|
284
|
+
'error',
|
|
285
|
+
`[CPU OVERLOAD] Restart failed for ${name} (pm_id: ${pm_id}):`,
|
|
286
|
+
restartErr,
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
setCpuOverloadRestartFailedHistory(pm_id)
|
|
290
|
+
}
|
|
152
291
|
}
|
|
153
|
-
}
|
|
292
|
+
}
|
|
154
293
|
} catch (err) {
|
|
155
294
|
logger('error', err)
|
|
156
295
|
}
|
|
@@ -169,10 +308,10 @@ const runModule = () => {
|
|
|
169
308
|
|
|
170
309
|
logger('info', 'Connected to PM2, starting monitor...')
|
|
171
310
|
|
|
172
|
-
|
|
311
|
+
processChecker()
|
|
173
312
|
|
|
174
313
|
setInterval(() => {
|
|
175
|
-
|
|
314
|
+
processChecker()
|
|
176
315
|
}, WORKER_INTERVAL)
|
|
177
316
|
})
|
|
178
317
|
|
|
@@ -199,7 +338,7 @@ const runModule = () => {
|
|
|
199
338
|
value: () => {
|
|
200
339
|
const res = []
|
|
201
340
|
|
|
202
|
-
for (const [k, v] of
|
|
341
|
+
for (const [k, v] of zombieRestartFailedHistory) {
|
|
203
342
|
if (v > 0) {
|
|
204
343
|
res.push([k, v])
|
|
205
344
|
}
|
|
@@ -216,7 +355,7 @@ const runModule = () => {
|
|
|
216
355
|
value: () => {
|
|
217
356
|
const res = []
|
|
218
357
|
|
|
219
|
-
for (const [pmId, arr] of
|
|
358
|
+
for (const [pmId, arr] of zombieCpuHistory) {
|
|
220
359
|
if (isZombie(arr)) {
|
|
221
360
|
res.push(pmId)
|
|
222
361
|
}
|
|
@@ -227,6 +366,57 @@ const runModule = () => {
|
|
|
227
366
|
return res.join(',')
|
|
228
367
|
},
|
|
229
368
|
})
|
|
369
|
+
|
|
370
|
+
Probe.metric({
|
|
371
|
+
name: 'CPU Overload Restarts',
|
|
372
|
+
value: () => {
|
|
373
|
+
const res = []
|
|
374
|
+
|
|
375
|
+
for (const [k, v] of cpuOverloadRestartHistory) {
|
|
376
|
+
if (v > 0) {
|
|
377
|
+
res.push([k, v])
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
if (!res.length) return 'N/A'
|
|
382
|
+
|
|
383
|
+
return res.map((v) => `[${v[0]}]:${v[1]}`).join(' ; ')
|
|
384
|
+
},
|
|
385
|
+
})
|
|
386
|
+
|
|
387
|
+
Probe.metric({
|
|
388
|
+
name: 'CPU Overload Processes',
|
|
389
|
+
value: () => {
|
|
390
|
+
const res = []
|
|
391
|
+
|
|
392
|
+
for (const [pmId, arr] of cpuOverloadHistory) {
|
|
393
|
+
if (isCpuOverload(arr)) {
|
|
394
|
+
res.push(pmId)
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
if (!res.length) return 'N/A'
|
|
399
|
+
|
|
400
|
+
return res.join(',')
|
|
401
|
+
},
|
|
402
|
+
})
|
|
403
|
+
|
|
404
|
+
Probe.metric({
|
|
405
|
+
name: 'CPU Overload Restarts (failed)',
|
|
406
|
+
value: () => {
|
|
407
|
+
const res = []
|
|
408
|
+
|
|
409
|
+
for (const [k, v] of cpuOverloadRestartFailedHistory) {
|
|
410
|
+
if (v > 0) {
|
|
411
|
+
res.push([k, v])
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
if (!res.length) return 'N/A'
|
|
416
|
+
|
|
417
|
+
return res.map((v) => `[${v[0]}]:${v[1]}`).join(' ; ')
|
|
418
|
+
},
|
|
419
|
+
})
|
|
230
420
|
}
|
|
231
421
|
|
|
232
422
|
runModule()
|
package/lib/defaults.js
CHANGED
|
@@ -28,6 +28,56 @@ const defaultOptions = {
|
|
|
28
28
|
* 僵尸进程最大重启次数,设置为0表示不限制
|
|
29
29
|
*/
|
|
30
30
|
zombieMaxRestarts: 0,
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* 是否开启 CPU 过载守护
|
|
34
|
+
*/
|
|
35
|
+
cpuOverloadDetection: false,
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* 判定 CPU 过载阈值
|
|
39
|
+
*/
|
|
40
|
+
cpuOverloadThreshold: 90,
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* 允许 CPU 过载最大连续出现次数,达到时自动重启
|
|
44
|
+
*/
|
|
45
|
+
cpuOverloadMaxHits: 5,
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* 是否开启 perf 性能采集
|
|
49
|
+
*/
|
|
50
|
+
enablePerfCollection: false,
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* 性能报告生成目录
|
|
54
|
+
*/
|
|
55
|
+
perfReportGenerationDir: '/var/log/pm2/perf',
|
|
56
|
+
|
|
57
|
+
/**
|
|
58
|
+
* flamegraph 火焰图生成工具目录
|
|
59
|
+
*/
|
|
60
|
+
flamegraphDir: '/opt/FlameGraph',
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* perf 采样持续时间 (s)
|
|
64
|
+
*/
|
|
65
|
+
perfSampleDuration: 10,
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* perf 采样频率 (Hz)
|
|
69
|
+
*/
|
|
70
|
+
perfSampleFrequency: 99,
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* 是否开启 node:inspector 性能采集
|
|
74
|
+
*/
|
|
75
|
+
enableNodeInspectorCollection: false,
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* node:inspector 性能采集持续时间 (s)
|
|
79
|
+
*/
|
|
80
|
+
nodeInspectorSampleDuration: 10,
|
|
31
81
|
}
|
|
32
82
|
|
|
33
83
|
module.exports = {
|
package/lib/message.js
CHANGED
|
@@ -1,19 +1,32 @@
|
|
|
1
1
|
const pm2 = require('pm2')
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* @param { number }
|
|
5
|
-
* @param { string } eventName 事件名
|
|
6
|
-
* @param {
|
|
4
|
+
* @param { number } pm_id - pm2 应用id
|
|
5
|
+
* @param { string } eventName - 事件名
|
|
6
|
+
* @param { object } [data] - 发送的数据
|
|
7
|
+
* @returns { Promise<void> }
|
|
7
8
|
*/
|
|
8
|
-
const sendMessage = (
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
9
|
+
const sendMessage = (pm_id, eventName, data) => {
  // Wraps the callback-based pm2.sendDataToProcessId in a Promise: resolves
  // with no value on success and rejects with the error pm2 reports.
  return new Promise((resolve, reject) => {
    const packet = {
      id: pm_id,
      type: 'process:msg',
      topic: true,
      // Event names are namespaced with the module name.
      data: {
        event: `pm2-perfmonitor:${eventName}`,
        data,
      },
    }

    const onSent = (err) => (err ? reject(err) : resolve())

    pm2.sendDataToProcessId(pm_id, packet, onSent)
  })
}
|
|
19
32
|
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
const fs = require('fs-extra')
|
|
2
|
+
const path = require('path')
|
|
3
|
+
|
|
4
|
+
let execaCommandCache
|
|
5
|
+
|
|
6
|
+
/**
 * Load the `execa` function through a dynamic import and memoize it in
 * `execaCommandCache`, so later calls reuse the first result.
 *
 * @returns { Promise<import('execa')['execa']> }
 */
const getExeca = async () => {
  if (execaCommandCache) {
    return execaCommandCache
  }

  const { execa } = await import('execa')
  execaCommandCache = execa

  return execaCommandCache
}
|
|
17
|
+
|
|
18
|
+
/**
 * Run a command without a shell, passing the arguments as an array.
 *
 * All standard streams are inherited from the current process unless the
 * caller overrides them. A failure is logged with `console.error` and
 * reported through the return value instead of being thrown.
 *
 * @param {string} cmd - command name or path
 * @param {string[]} args - argument list
 * @param {object} [options] - execa options, merged over the defaults
 * @returns {Promise<boolean>} `true` when the command succeeded, `false` otherwise
 */
const execCommand = async (cmd, args, options = {}) => {
  try {
    const execa = await getExeca()

    // execa rejects `stdio` combined with `stdin`/`stdout`/`stderr`. When the
    // caller redirects an individual stream, express the "inherit" default per
    // stream so the override can take effect instead of failing the call.
    const overridesStream = ['stdin', 'stdout', 'stderr'].some(
      (key) => options[key] !== undefined,
    )
    const inheritDefaults =
      options.stdio === undefined && overridesStream
        ? { stdin: 'inherit', stdout: 'inherit', stderr: 'inherit' }
        : { stdio: 'inherit' }

    await execa(cmd, args, {
      ...inheritDefaults,
      ...options,
    })
    return true
  } catch (err) {
    console.error(`Command failed: ${cmd} ${args.join(' ')}`, err.message)
    return false
  }
}
|
|
38
|
+
|
|
39
|
+
/**
 * Build a file-name-safe timestamp of the form `YYYYMMDD_HHmmss` from the
 * local-time fields of a date, without going through locale formatting.
 *
 * @param {Date} [now] - the moment to format; defaults to the current time
 * @returns {string} timestamp such as `20240105_030409`
 */
const getSafeTimestamp = (now = new Date()) => {
  const pad = (value) => String(value).padStart(2, '0')

  const datePart = [
    now.getFullYear(),
    pad(now.getMonth() + 1), // getMonth() is zero-based
    pad(now.getDate()),
  ].join('')
  const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()]
    .map(pad)
    .join('')

  return `${datePart}_${timePart}`
}
|
|
52
|
+
|
|
53
|
+
/**
 * Sample a process with `perf` and, when the FlameGraph scripts are
 * available and executable, render the samples as an SVG flame graph.
 *
 * Steps: `perf record` -> `perf script` (text stacks) -> optional
 * `stackcollapse-perf.pl` -> optional `flamegraph.pl`. Failures are logged
 * through `console` and end the run early.
 *
 * @param {Object} options - settings
 * @param {number} options.pid - PID of the process to sample (positive integer)
 * @param {string} options.moduleName - module name, used as the log prefix
 * @param {string} options.perfDir - directory for generated files (also the default location of the perf data file)
 * @param {string} options.flamegraphDir - directory containing the FlameGraph scripts
 * @param {number} [options.sampleDuration=10] - sampling duration in seconds
 * @param {number} [options.sampleFrequency=99] - sampling frequency in Hz
 * @param {string} [options.perfDataFile] - custom perf data file path (generated under `perfDir` when omitted)
 * @param {boolean} [options.keepPerfData=false] - keep the raw perf data file after the stacks were exported
 * @returns {Promise<void>}
 */
const performPerfSampling = async ({
  pid,
  moduleName,
  perfDir,
  flamegraphDir,
  sampleDuration = 10,
  sampleFrequency = 99,
  perfDataFile: customPerfDataFile,
  keepPerfData = false,
}) => {
  const logger = (type, ...args) => {
    console[type](`[${moduleName}]`, ...args)
  }

  // Options that send a command's stdout to a file. `stdio` is cleared
  // because execCommand defaults it to 'inherit' and execa refuses `stdio`
  // together with `stdout`. Buffering is disabled so large stack dumps are
  // not also held in memory.
  const redirectStdoutTo = (file) => ({
    stdio: undefined,
    stdin: 'inherit',
    stdout: { file },
    stderr: 'inherit',
    buffer: false,
  })

  // --- Argument validation ---
  if (!perfDir) {
    logger('error', 'perfDir cannot be empty')
    return
  }
  if (!flamegraphDir) {
    logger('error', 'flamegraphDir cannot be empty')
    return
  }

  // The PID must be a positive integer.
  const pidNum = Number(pid)
  if (!Number.isInteger(pidNum) || pidNum <= 0) {
    logger('error', `Invalid PID: ${pid} – must be a positive integer`)
    return
  }

  // Fall back to the defaults for anything that is not a finite positive number.
  const finalDuration =
    Number.isFinite(sampleDuration) && sampleDuration > 0 ? sampleDuration : 10
  const finalFrequency =
    Number.isFinite(sampleFrequency) && sampleFrequency > 0
      ? sampleFrequency
      : 99

  // Make sure the perf directory exists; the stacks/folded/svg files always go there.
  try {
    await fs.ensureDir(perfDir)
    logger('info', `Perf directory ready: ${perfDir}`)
  } catch (err) {
    logger('error', `Failed to create perf directory: ${err.message}`)
    return
  }

  // Check that the perf command can be launched at all.
  try {
    const execa = await getExeca()

    await execa('perf', ['--version'], { timeout: 5000 })

    logger('info', 'Perf permission check passed')
  } catch (err) {
    logger('error', `Perf permission check failed: ${err.message}`)
    logger(
      'error',
      `Please ensure the perf command is installed and the user has permission to run it.\n
You can configure the system to allow non-root perf by setting:\n
"echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid"\n
"sudo setcap cap_sys_admin+ep $(which perf)"`,
    )
    return // perf is unusable, nothing more to do
  }

  // One timestamp shared by every file generated in this run.
  const timestamp = getSafeTimestamp()

  // Resolve the perf data file path.
  let perfDataFile
  if (customPerfDataFile) {
    perfDataFile = customPerfDataFile
    // Make sure the parent directory of the custom path exists.
    const parentDir = path.dirname(perfDataFile)
    try {
      await fs.ensureDir(parentDir)
    } catch (err) {
      logger(
        'error',
        `Failed to create directory for custom perfDataFile: ${err.message}`,
      )
      return
    }
  } else {
    perfDataFile = path.join(perfDir, `perf.${pidNum}.${timestamp}.data`)
  }

  // Derived files are named from perfDir and the timestamp, independent of perfDataFile.
  const perfStacksFile = path.join(
    perfDir,
    `perf.${pidNum}.${timestamp}.stacks`,
  )
  const perfFoldedFile = path.join(
    perfDir,
    `perf.${pidNum}.${timestamp}.folded`,
  )
  const perfSvgFile = path.join(perfDir, `perf.${pidNum}.${timestamp}.svg`)

  try {
    logger(
      'info',
      `PID:${pidNum} Starting perf sampling (${finalDuration}s, ${finalFrequency}Hz)`,
    )

    // --- Step 1: perf record ---
    const recordOk = await execCommand('perf', [
      'record',
      '-o',
      perfDataFile,
      '-F',
      String(finalFrequency),
      '-p',
      String(pidNum),
      '-g',
      '--',
      'sleep',
      String(finalDuration),
    ])
    if (!recordOk) return

    // --- Step 2: perf script, exporting the samples as text stacks ---
    const scriptOk = await execCommand(
      'perf',
      ['script', '-i', perfDataFile],
      redirectStdoutTo(perfStacksFile),
    )

    if (!scriptOk) return

    logger('info', `PID:${pidNum} Perf sampling completed: ${perfStacksFile}`)

    // Remove the raw data file unless the caller asked to keep it.
    if (!keepPerfData) {
      await fs.remove(perfDataFile).catch((err) => {
        // Best effort: a leftover data file must not abort flame graph generation.
        logger(
          'warn',
          `PID:${pidNum} Failed to remove perf data file ${perfDataFile}: ${err.message}`,
        )
      })
    }

    // --- Step 3: check the FlameGraph scripts ---
    const stackcollapsePath = path.join(flamegraphDir, 'stackcollapse-perf.pl')
    const flamegraphPath = path.join(flamegraphDir, 'flamegraph.pl')

    const isStackcollapseValid = await fs
      .access(stackcollapsePath, fs.constants.X_OK)
      .then(() => true)
      .catch(() => false)
    const isFlamegraphValid = await fs
      .access(flamegraphPath, fs.constants.X_OK)
      .then(() => true)
      .catch(() => false)

    if (isStackcollapseValid && isFlamegraphValid) {
      // --- Step 4: generate the folded stacks file ---
      const collapseOk = await execCommand(
        stackcollapsePath,
        [perfStacksFile],
        redirectStdoutTo(perfFoldedFile),
      )
      if (!collapseOk) return

      // --- Step 5: generate the SVG flame graph ---
      const flameOk = await execCommand(
        flamegraphPath,
        [perfFoldedFile],
        redirectStdoutTo(perfSvgFile),
      )
      if (flameOk) {
        logger('info', `PID:${pidNum} Flame graph generated: ${perfSvgFile}`)
      }
    } else {
      const missing = []
      if (!isStackcollapseValid) missing.push('stackcollapse-perf.pl')
      if (!isFlamegraphValid) missing.push('flamegraph.pl')
      logger(
        'info',
        `PID:${pidNum} Skip flame graph – missing/not executable: ${missing.join(', ')}`,
      )
    }
  } catch (err) {
    logger('error', `PID:${pidNum} Perf sampling exception: ${err.message}`)
  }
}
|
|
246
|
+
|
|
247
|
+
module.exports = { performPerfSampling }
|
package/lib/pm2-extra.js
CHANGED
|
@@ -31,7 +31,24 @@ const stopAppAsync = (pm_id) => {
|
|
|
31
31
|
})
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
+
/**
 * Promise wrapper around the callback-based `pm2.restart`.
 *
 * @param { string | number } pm_id - identifier passed straight to `pm2.restart`
 * @returns { Promise<void> } resolves with no value on success, rejects with the error pm2 reports
 */
const restartAppAsync = (pm_id) => {
  return new Promise((resolve, reject) => {
    const onRestarted = (err) => (err ? reject(err) : resolve())

    pm2.restart(pm_id, onRestarted)
  })
}
|
|
49
|
+
|
|
34
50
|
module.exports = {
|
|
35
51
|
listAppsAsync,
|
|
36
52
|
stopAppAsync,
|
|
53
|
+
restartAppAsync,
|
|
37
54
|
}
|
package/lib/utils.js
CHANGED
|
@@ -28,8 +28,18 @@ const parseBool = (value, defaultVal = false) => {
|
|
|
28
28
|
return defaultVal
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
+
/**
 * Resolve after the given delay.
 *
 * `setTimeout` cannot represent delays above 2^31 - 1 ms; it falls back to a
 * delay of about 1 ms, so an oversized duration would not sleep at all. The
 * requested duration is therefore clamped to the supported range, and `NaN`
 * is treated as 0.
 *
 * @param { number } [duration=0] - sleep duration (ms)
 * @returns { Promise<void> }
 */
const sleepAsync = (duration = 0) => {
  const MAX_TIMER_DELAY = 2147483647
  const requested = Number(duration)
  const delay = Number.isNaN(requested)
    ? 0
    : Math.min(Math.max(requested, 0), MAX_TIMER_DELAY)

  return new Promise((resolve) => {
    setTimeout(resolve, delay)
  })
}
|
|
39
|
+
|
|
31
40
|
module.exports = {
|
|
32
41
|
parseParamToArray,
|
|
33
42
|
parseParamToNumber,
|
|
34
43
|
parseBool,
|
|
44
|
+
sleepAsync,
|
|
35
45
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pm2-perfmonitor",
|
|
3
|
-
"version": "1.2
|
|
3
|
+
"version": "2.1.2",
|
|
4
4
|
"description": "A pm2 module for performance monitoring. Automatically detect zombie processes and restart it",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "elenh",
|
|
@@ -42,6 +42,8 @@
|
|
|
42
42
|
],
|
|
43
43
|
"config": {},
|
|
44
44
|
"dependencies": {
|
|
45
|
+
"execa": "^9.6.1",
|
|
46
|
+
"fs-extra": "^11.3.4",
|
|
45
47
|
"pm2": "latest",
|
|
46
48
|
"pmx": "latest"
|
|
47
49
|
},
|
|
@@ -50,4 +52,4 @@
|
|
|
50
52
|
"cz-conventional-changelog": "^3.3.0",
|
|
51
53
|
"minimist": "^1.2.8"
|
|
52
54
|
}
|
|
53
|
-
}
|
|
55
|
+
}
|