pi-deadman 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +110 -0
- package/__tests__/calibration.test.ts +73 -0
- package/__tests__/canary.test.ts +68 -0
- package/__tests__/fast-watchdog.test.ts +188 -0
- package/__tests__/index.test.ts +103 -0
- package/__tests__/keywords.test.ts +130 -0
- package/__tests__/logging.test.ts +128 -0
- package/__tests__/monitor.test.ts +115 -0
- package/__tests__/processes.test.ts +74 -0
- package/__tests__/signals.test.ts +59 -0
- package/__tests__/tree.test.ts +327 -0
- package/__tests__/watchdog.test.ts +421 -0
- package/__tests__/worker.test.ts +85 -0
- package/__tests__/zones.test.ts +182 -0
- package/extensions/calibration.ts +62 -0
- package/extensions/canary.ts +51 -0
- package/extensions/index.ts +363 -0
- package/extensions/keywords.ts +77 -0
- package/extensions/logging.ts +82 -0
- package/extensions/monitor.ts +512 -0
- package/extensions/processes.ts +94 -0
- package/extensions/signals.ts +172 -0
- package/extensions/tree.ts +218 -0
- package/extensions/watchdog.ts +138 -0
- package/extensions/worker.ts +208 -0
- package/extensions/zones.ts +109 -0
- package/helpers/footprint.py +72 -0
- package/helpers/footprint_worker.py +214 -0
- package/package.json +24 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +10 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
// signals.ts — macOS kernel memory metrics read via sysctl and vm_stat
|
|
2
|
+
|
|
3
|
+
import { execSync } from "child_process";
|
|
4
|
+
|
|
5
|
+
// Module-level state for rate computation: the previous vm_stat sample and
// the epoch-seconds timestamp at which it was taken. Both stay null until
// the first collectSignals() call, which is why first-call rates are 0.
let previousVmStat: Record<string, number> | null = null;
let previousTimestamp: number | null = null;
|
|
8
|
+
|
|
9
|
+
// One sample of macOS memory-health signals as collected by collectSignals().
export interface SystemSignals {
  swapout_rate: number; // swapouts/sec delta from previous call, 0 on first call
  swapin_rate: number; // swapins/sec delta
  decomp_rate: number; // decompressions/sec delta
  pressure_level: number; // 1 (normal), 2, or 4 from sysctl kern.memorystatus_vm_pressure_level
  memorystatus_level: number; // 0-100 from sysctl kern.memorystatus_level
  swap_used_mb: number; // from sysctl vm.swapusage
  swap_free_mb: number; // from sysctl vm.swapusage
  compression_ratio: number; // pages_stored / compressor_page_count, min 1.0
}
|
|
19
|
+
|
|
20
|
+
export async function collectSignals(): Promise<SystemSignals> {
|
|
21
|
+
const timestamp = Date.now() / 1000; // Convert to seconds
|
|
22
|
+
|
|
23
|
+
// Initialize with safe defaults
|
|
24
|
+
let swapout_rate = 0;
|
|
25
|
+
let swapin_rate = 0;
|
|
26
|
+
let decomp_rate = 0;
|
|
27
|
+
let pressure_level = 1;
|
|
28
|
+
let memorystatus_level = 0;
|
|
29
|
+
let swap_used_mb = 0;
|
|
30
|
+
let swap_free_mb = 0;
|
|
31
|
+
let compression_ratio = 1.0;
|
|
32
|
+
|
|
33
|
+
// Get current vm_stat data
|
|
34
|
+
const currentVmStat = getVmStat();
|
|
35
|
+
|
|
36
|
+
// Compute delta-based rates if we have previous data
|
|
37
|
+
if (previousVmStat !== null && previousTimestamp !== null) {
|
|
38
|
+
const timeDelta = timestamp - previousTimestamp;
|
|
39
|
+
if (timeDelta > 0) {
|
|
40
|
+
// Swapout rate: pages written to swap per second
|
|
41
|
+
const prevPageouts = previousVmStat.swapouts || 0;
|
|
42
|
+
const currPageouts = currentVmStat.swapouts || 0;
|
|
43
|
+
swapout_rate = Math.max(0, (currPageouts - prevPageouts) / timeDelta);
|
|
44
|
+
|
|
45
|
+
// Swapin rate: pages read from swap per second (swapins)
|
|
46
|
+
const prevPageins = previousVmStat.swapins || 0;
|
|
47
|
+
const currPageins = currentVmStat.swapins || 0;
|
|
48
|
+
swapin_rate = Math.max(0, (currPageins - prevPageins) / timeDelta);
|
|
49
|
+
|
|
50
|
+
// Decompression rate: pages decompressed per second
|
|
51
|
+
const prevDecomp = previousVmStat.decompressions || 0;
|
|
52
|
+
const currDecomp = currentVmStat.decompressions || 0;
|
|
53
|
+
decomp_rate = Math.max(0, (currDecomp - prevDecomp) / timeDelta);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Store current state for next call
|
|
58
|
+
previousVmStat = currentVmStat;
|
|
59
|
+
previousTimestamp = timestamp;
|
|
60
|
+
|
|
61
|
+
// Get point-in-time signals
|
|
62
|
+
pressure_level = getPressureLevel();
|
|
63
|
+
memorystatus_level = getMemorystatusLevel();
|
|
64
|
+
[swap_used_mb, swap_free_mb] = getSwapUsage();
|
|
65
|
+
compression_ratio = getCompressionRatio(currentVmStat);
|
|
66
|
+
|
|
67
|
+
return {
|
|
68
|
+
swapout_rate,
|
|
69
|
+
swapin_rate,
|
|
70
|
+
decomp_rate,
|
|
71
|
+
pressure_level,
|
|
72
|
+
memorystatus_level,
|
|
73
|
+
swap_used_mb,
|
|
74
|
+
swap_free_mb,
|
|
75
|
+
compression_ratio
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
function getVmStat(): Record<string, number> {
|
|
80
|
+
try {
|
|
81
|
+
const output = execSync("vm_stat", { encoding: "utf8", timeout: 5000 });
|
|
82
|
+
const data: Record<string, number> = {};
|
|
83
|
+
|
|
84
|
+
for (const line of output.trim().split("\n")) {
|
|
85
|
+
if (line.includes(":")) {
|
|
86
|
+
const [key, valueStr] = line.split(":");
|
|
87
|
+
const value = valueStr.trim().replace(/\.$/, ""); // Remove trailing period
|
|
88
|
+
const numValue = parseInt(value, 10);
|
|
89
|
+
if (!isNaN(numValue)) {
|
|
90
|
+
data[key.trim().toLowerCase()] = numValue;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
return data;
|
|
96
|
+
} catch {
|
|
97
|
+
return {};
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
function getPressureLevel(): number {
|
|
102
|
+
try {
|
|
103
|
+
const output = execSync("sysctl -n kern.memorystatus_vm_pressure_level", {
|
|
104
|
+
encoding: "utf8",
|
|
105
|
+
timeout: 5000
|
|
106
|
+
});
|
|
107
|
+
const level = parseInt(output.trim(), 10);
|
|
108
|
+
if ([1, 2, 4].includes(level)) {
|
|
109
|
+
return level;
|
|
110
|
+
}
|
|
111
|
+
return 1; // Default to normal
|
|
112
|
+
} catch {
|
|
113
|
+
return 1;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
function getMemorystatusLevel(): number {
|
|
118
|
+
try {
|
|
119
|
+
const output = execSync("sysctl -n kern.memorystatus_level", {
|
|
120
|
+
encoding: "utf8",
|
|
121
|
+
timeout: 5000
|
|
122
|
+
});
|
|
123
|
+
const level = parseInt(output.trim(), 10);
|
|
124
|
+
return isNaN(level) ? 0 : Math.max(0, Math.min(100, level));
|
|
125
|
+
} catch {
|
|
126
|
+
return 0;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
function getSwapUsage(): [number, number] {
|
|
131
|
+
try {
|
|
132
|
+
const output = execSync("sysctl -n vm.swapusage", {
|
|
133
|
+
encoding: "utf8",
|
|
134
|
+
timeout: 5000
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
let usedMb = 0;
|
|
138
|
+
let freeMb = 0;
|
|
139
|
+
|
|
140
|
+
// Parse "used = X.XXM free = Y.YYM"
|
|
141
|
+
const parts = output.trim().split(/\s+/);
|
|
142
|
+
for (let i = 0; i < parts.length - 2; i++) {
|
|
143
|
+
if (parts[i] === "used" && parts[i + 1] === "=") {
|
|
144
|
+
const value = parseFloat(parts[i + 2].replace("M", ""));
|
|
145
|
+
if (!isNaN(value)) usedMb = value;
|
|
146
|
+
} else if (parts[i] === "free" && parts[i + 1] === "=") {
|
|
147
|
+
const value = parseFloat(parts[i + 2].replace("M", ""));
|
|
148
|
+
if (!isNaN(value)) freeMb = value;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
return [usedMb, freeMb];
|
|
153
|
+
} catch {
|
|
154
|
+
return [0, 0];
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
function getCompressionRatio(vmData: Record<string, number>): number {
|
|
159
|
+
try {
|
|
160
|
+
const compressorPageCount = vmData["pages occupied by compressor"] || 0;
|
|
161
|
+
const pagesStoredInCompressor = vmData["pages stored in compressor"] || 0;
|
|
162
|
+
|
|
163
|
+
if (compressorPageCount > 0 && pagesStoredInCompressor > 0) {
|
|
164
|
+
const ratio = pagesStoredInCompressor / compressorPageCount;
|
|
165
|
+
return Math.max(1.0, ratio);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
return 1.0;
|
|
169
|
+
} catch {
|
|
170
|
+
return 1.0;
|
|
171
|
+
}
|
|
172
|
+
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
// One observed process at snapshot time.
export interface SnapshotProcess {
  pid: number; // process id
  name: string; // process/command name
  footprint_mb: number; // memory footprint in MB
  age_seconds: number | null; // process age; null when it could not be determined
}
|
|
7
|
+
|
|
8
|
+
// A snapshot process annotated with how much it grew across the history window.
export interface GrowingProcess extends SnapshotProcess {
  delta_mb: number; // newest observed footprint minus oldest, in MB
}
|
|
11
|
+
|
|
12
|
+
// Format age in seconds to human-readable string
|
|
13
|
+
export function formatAge(seconds: number | null): string {
|
|
14
|
+
if (seconds === null) {
|
|
15
|
+
return "unknown";
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
if (seconds < 60) {
|
|
19
|
+
return `${seconds}s`;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
if (seconds < 3600) {
|
|
23
|
+
const minutes = Math.floor(seconds / 60);
|
|
24
|
+
const remainingSeconds = seconds % 60;
|
|
25
|
+
return `${minutes}m ${remainingSeconds}s`;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// >= 3600 (1 hour or more)
|
|
29
|
+
const hours = Math.floor(seconds / 3600);
|
|
30
|
+
const remainingMinutes = Math.floor((seconds % 3600) / 60);
|
|
31
|
+
return `${hours}h ${remainingMinutes}m`;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Parse ps etime format to seconds.
|
|
36
|
+
* Formats: SS, MM:SS, HH:MM:SS, DD-HH:MM:SS
|
|
37
|
+
* Returns null on parse failure.
|
|
38
|
+
*/
|
|
39
|
+
export function parseEtime(etime: string): number | null {
|
|
40
|
+
const trimmed = etime.trim();
|
|
41
|
+
if (!trimmed) return null;
|
|
42
|
+
|
|
43
|
+
try {
|
|
44
|
+
let days = 0;
|
|
45
|
+
let timePart = trimmed;
|
|
46
|
+
|
|
47
|
+
if (timePart.includes("-")) {
|
|
48
|
+
const [dayStr, rest] = timePart.split("-", 2);
|
|
49
|
+
days = parseInt(dayStr, 10);
|
|
50
|
+
if (isNaN(days)) return null;
|
|
51
|
+
timePart = rest;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const parts = timePart.split(":").map(p => parseInt(p, 10));
|
|
55
|
+
if (parts.some(isNaN)) return null;
|
|
56
|
+
|
|
57
|
+
let seconds = days * 86400;
|
|
58
|
+
if (parts.length === 3) {
|
|
59
|
+
seconds += parts[0] * 3600 + parts[1] * 60 + parts[2];
|
|
60
|
+
} else if (parts.length === 2) {
|
|
61
|
+
seconds += parts[0] * 60 + parts[1];
|
|
62
|
+
} else if (parts.length === 1) {
|
|
63
|
+
seconds += parts[0];
|
|
64
|
+
} else {
|
|
65
|
+
return null;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
return seconds;
|
|
69
|
+
} catch {
|
|
70
|
+
return null;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// Natural language description of a process
|
|
75
|
+
export function describeProcess(proc: SnapshotProcess): string {
|
|
76
|
+
const age = formatAge(proc.age_seconds);
|
|
77
|
+
return `${proc.name} [PID ${proc.pid}] (${proc.footprint_mb} MB, running ${age})`;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Find processes in current that don't exist in previous (by PID)
|
|
81
|
+
export function findNewProcesses(
|
|
82
|
+
previous: SnapshotProcess[],
|
|
83
|
+
current: SnapshotProcess[],
|
|
84
|
+
): SnapshotProcess[] {
|
|
85
|
+
const previousPids = new Set(previous.map(p => p.pid));
|
|
86
|
+
return current.filter(proc => !previousPids.has(proc.pid));
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Find processes with sustained growth across a snapshot history.
|
|
91
|
+
*
|
|
92
|
+
* "Growing" = present in at least 3 snapshots, footprint increased between
|
|
93
|
+
* consecutive pairs in at least 3 of those transitions, AND total delta
|
|
94
|
+
* (newest - oldest observed footprint) >= thresholdMb.
|
|
95
|
+
*
|
|
96
|
+
* This is resilient to momentary plateaus — a process that grows, holds,
|
|
97
|
+
* then grows again still qualifies as long as 3+ transitions show increases.
|
|
98
|
+
*
|
|
99
|
+
* @param snapshotHistory - ring buffer of snapshots, oldest first (max ~10).
|
|
100
|
+
* @param currentChildren - current process list (used to filter to eligible PIDs only)
|
|
101
|
+
* @param thresholdMb - minimum total delta from oldest to newest (default 200)
|
|
102
|
+
* @param minGrowthIntervals - minimum number of intervals showing growth (default 3)
|
|
103
|
+
*/
|
|
104
|
+
export function findGrowingProcesses(
|
|
105
|
+
snapshotHistory: SnapshotProcess[][],
|
|
106
|
+
currentChildren: SnapshotProcess[],
|
|
107
|
+
thresholdMb: number = 200,
|
|
108
|
+
minGrowthIntervals: number = 3,
|
|
109
|
+
): GrowingProcess[] {
|
|
110
|
+
// Need at least 3 snapshots to have 2 intervals
|
|
111
|
+
if (snapshotHistory.length < 3) {
|
|
112
|
+
return [];
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
const eligiblePids = new Set(currentChildren.map(p => p.pid));
|
|
116
|
+
const growing: GrowingProcess[] = [];
|
|
117
|
+
|
|
118
|
+
for (const pid of eligiblePids) {
|
|
119
|
+
// Collect footprint values for this PID across all snapshots (skip gaps)
|
|
120
|
+
const footprints: number[] = [];
|
|
121
|
+
for (const snapshot of snapshotHistory) {
|
|
122
|
+
const proc = snapshot.find(p => p.pid === pid);
|
|
123
|
+
if (proc) {
|
|
124
|
+
footprints.push(proc.footprint_mb);
|
|
125
|
+
}
|
|
126
|
+
// Missing from a snapshot is OK — we just skip it
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Need the PID present in at least 3 snapshots
|
|
130
|
+
if (footprints.length < 3) continue;
|
|
131
|
+
|
|
132
|
+
// Count intervals where footprint increased
|
|
133
|
+
let growthIntervals = 0;
|
|
134
|
+
for (let i = 1; i < footprints.length; i++) {
|
|
135
|
+
if (footprints[i] > footprints[i - 1]) {
|
|
136
|
+
growthIntervals++;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
if (growthIntervals < minGrowthIntervals) continue;
|
|
141
|
+
|
|
142
|
+
// Check total delta meets threshold (newest - oldest observed)
|
|
143
|
+
const totalDelta = footprints[footprints.length - 1] - footprints[0];
|
|
144
|
+
if (totalDelta < thresholdMb) continue;
|
|
145
|
+
|
|
146
|
+
const currentProc = currentChildren.find(p => p.pid === pid)!;
|
|
147
|
+
growing.push({
|
|
148
|
+
...currentProc,
|
|
149
|
+
delta_mb: totalDelta,
|
|
150
|
+
});
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// Sort by total delta descending
|
|
154
|
+
return growing.sort((a, b) => b.delta_mb - a.delta_mb);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Find the largest group of processes with the same name (>= 2 members)
|
|
158
|
+
// Returns the group, or empty array if no group has >= 2 members
|
|
159
|
+
export function findSimilarGroup(processes: SnapshotProcess[]): SnapshotProcess[] {
|
|
160
|
+
// Group processes by name
|
|
161
|
+
const groups = new Map<string, SnapshotProcess[]>();
|
|
162
|
+
|
|
163
|
+
for (const proc of processes) {
|
|
164
|
+
if (!groups.has(proc.name)) {
|
|
165
|
+
groups.set(proc.name, []);
|
|
166
|
+
}
|
|
167
|
+
groups.get(proc.name)!.push(proc);
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// Find the largest group with >= 2 members
|
|
171
|
+
let largestGroup: SnapshotProcess[] = [];
|
|
172
|
+
|
|
173
|
+
for (const group of groups.values()) {
|
|
174
|
+
if (group.length >= 2 && group.length > largestGroup.length) {
|
|
175
|
+
largestGroup = group;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
return largestGroup;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* Find the heaviest swarm of same-name processes.
|
|
184
|
+
*
|
|
185
|
+
* A "swarm" is >= minCount processes with the same name whose combined
|
|
186
|
+
* footprint >= minCombinedMb. Returns the group with the highest combined
|
|
187
|
+
* footprint, sorted by individual footprint descending.
|
|
188
|
+
*
|
|
189
|
+
* Use case: 7 vitest workers each at 500 MB = 3.5 GB swarm.
|
|
190
|
+
*/
|
|
191
|
+
export function findSwarm(
|
|
192
|
+
processes: SnapshotProcess[],
|
|
193
|
+
minCount: number = 3,
|
|
194
|
+
minCombinedMb: number = 500,
|
|
195
|
+
): SnapshotProcess[] {
|
|
196
|
+
const groups = new Map<string, SnapshotProcess[]>();
|
|
197
|
+
|
|
198
|
+
for (const proc of processes) {
|
|
199
|
+
if (!groups.has(proc.name)) {
|
|
200
|
+
groups.set(proc.name, []);
|
|
201
|
+
}
|
|
202
|
+
groups.get(proc.name)!.push(proc);
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
let bestGroup: SnapshotProcess[] = [];
|
|
206
|
+
let bestCombined = 0;
|
|
207
|
+
|
|
208
|
+
for (const group of groups.values()) {
|
|
209
|
+
if (group.length < minCount) continue;
|
|
210
|
+
const combined = group.reduce((sum, p) => sum + p.footprint_mb, 0);
|
|
211
|
+
if (combined >= minCombinedMb && combined > bestCombined) {
|
|
212
|
+
bestGroup = group;
|
|
213
|
+
bestCombined = combined;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
return bestGroup.sort((a, b) => b.footprint_mb - a.footprint_mb);
|
|
218
|
+
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import type { SnapshotProcess } from "./tree";
|
|
2
|
+
import { findGrowingProcesses, findSwarm } from "./tree";
|
|
3
|
+
|
|
4
|
+
// Mutable watchdog bookkeeping carried between polls.
export interface WatchdogState {
  /** Ring buffer of recent snapshots (oldest first, max 10). Each entry is pi's child process list. */
  snapshotHistory: SnapshotProcess[][];
  /** Epoch seconds of the last snapshot taken while system was stably non-RED. */
  lastNonRedTimestamp: number;
  /** Count of consecutive non-RED polls. Used for hysteresis on lastNonRedTimestamp. */
  consecutiveNonRedPolls: number;
  lastKillTime: number; // Date.now() (ms) of last kill, 0 if never
  cooldownMs: number; // minimum ms between kills
}
|
|
14
|
+
|
|
15
|
+
// The watchdog's verdict: which processes to kill and why.
export interface KillDecision {
  targets: SnapshotProcess[]; // processes to kill
  reason: string; // human-readable reason, suitable for logging
}
|
|
19
|
+
|
|
20
|
+
// Should we auto-kill? Only on confirmed RED.
|
|
21
|
+
export function shouldAutoKill(zone: string, confirmed: boolean): boolean {
|
|
22
|
+
return zone === "RED" && confirmed === true;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/** Minimum footprint to be considered a kill candidate. Filters infrastructure noise. */
export const MIN_FOOTPRINT_MB = 50;

/** Growth detection: minimum sustained oldest-to-newest delta in MB. */
export const GROWTH_THRESHOLD_MB = 100;

/** Swarm detection: minimum group size and combined footprint in MB. */
export const SWARM_MIN_COUNT = 3;
export const SWARM_MIN_COMBINED_MB = 500;

/** "Heavy & young" detection: maximum age and minimum footprint. */
export const HEAVY_YOUNG_MAX_AGE_SECONDS = 600; // 10 minutes
export const HEAVY_YOUNG_MIN_FOOTPRINT_MB = 200; // 200 MB (was 1 GB)
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Select what to kill from process list.
|
|
41
|
+
*
|
|
42
|
+
* Priority chain:
|
|
43
|
+
* Floor: processes with footprint < 50 MB are never kill candidates
|
|
44
|
+
* 1. Growing — sustained delta ≥100 MB across 3+ of last 10 snapshots → kill ALL
|
|
45
|
+
* 2. Swarm — ≥3 same-name processes, combined footprint ≥500 MB → kill ALL in swarm
|
|
46
|
+
* 3. Heavy & young — age < 10 min AND footprint ≥ 200 MB → kill ALL, largest first
|
|
47
|
+
* 4. Newest — appeared after last stable non-RED state → kill LARGEST ONLY (not batch)
|
|
48
|
+
* 5. No match → don't kill, block commands and wait
|
|
49
|
+
*
|
|
50
|
+
* Returns null if nothing should be killed (no evidence, cooldown, empty).
|
|
51
|
+
*/
|
|
52
|
+
export function selectKillTarget(
|
|
53
|
+
currentChildren: SnapshotProcess[],
|
|
54
|
+
state: WatchdogState,
|
|
55
|
+
maxAgeSeconds: number,
|
|
56
|
+
protectedPid?: number,
|
|
57
|
+
): KillDecision | null {
|
|
58
|
+
// 1. Check cooldown
|
|
59
|
+
if (state.lastKillTime > 0 && Date.now() - state.lastKillTime < state.cooldownMs) {
|
|
60
|
+
return null;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// 2. Apply footprint floor + age filter + protected PID
|
|
64
|
+
let eligible = currentChildren.filter(proc =>
|
|
65
|
+
proc.footprint_mb >= MIN_FOOTPRINT_MB &&
|
|
66
|
+
(proc.age_seconds === null || proc.age_seconds <= maxAgeSeconds)
|
|
67
|
+
);
|
|
68
|
+
|
|
69
|
+
if (protectedPid !== undefined) {
|
|
70
|
+
eligible = eligible.filter(proc => proc.pid !== protectedPid);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if (eligible.length === 0) {
|
|
74
|
+
return null;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// Priority 1: Growing — kill ALL processes with sustained growth
|
|
78
|
+
if (state.snapshotHistory.length >= 3) {
|
|
79
|
+
const growing = findGrowingProcesses(state.snapshotHistory, eligible, GROWTH_THRESHOLD_MB);
|
|
80
|
+
if (growing.length > 0) {
|
|
81
|
+
return {
|
|
82
|
+
targets: growing,
|
|
83
|
+
reason: `Killing ${growing.length} growing process(es): ${growing.map(p => `${p.name}(+${p.delta_mb}MB)`).join(", ")}`,
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// Priority 2: Swarm — kill ALL in the heaviest same-name cluster
|
|
89
|
+
const swarm = findSwarm(eligible, SWARM_MIN_COUNT, SWARM_MIN_COMBINED_MB);
|
|
90
|
+
if (swarm.length > 0) {
|
|
91
|
+
const combined = swarm.reduce((sum, p) => sum + p.footprint_mb, 0);
|
|
92
|
+
return {
|
|
93
|
+
targets: swarm,
|
|
94
|
+
reason: `Killing swarm of ${swarm.length} ${swarm[0].name} processes (combined ${combined}MB): ${swarm.map(p => `${p.name}(${p.footprint_mb}MB)`).join(", ")}`,
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Priority 3: Heavy & young — kill ALL recently started heavy processes
|
|
99
|
+
const heavyYoung = eligible
|
|
100
|
+
.filter(proc =>
|
|
101
|
+
proc.age_seconds !== null &&
|
|
102
|
+
proc.age_seconds <= HEAVY_YOUNG_MAX_AGE_SECONDS &&
|
|
103
|
+
proc.footprint_mb >= HEAVY_YOUNG_MIN_FOOTPRINT_MB
|
|
104
|
+
)
|
|
105
|
+
.sort((a, b) => b.footprint_mb - a.footprint_mb);
|
|
106
|
+
|
|
107
|
+
if (heavyYoung.length > 0) {
|
|
108
|
+
return {
|
|
109
|
+
targets: heavyYoung,
|
|
110
|
+
reason: `Killing ${heavyYoung.length} heavy & young process(es): ${heavyYoung.map(p => `${p.name}(${p.footprint_mb}MB, age ${p.age_seconds}s)`).join(", ")}`,
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Priority 4: Newest — kill ONLY the single largest process that appeared
|
|
115
|
+
// after the system was last stably non-RED. Not batch — weakest signal.
|
|
116
|
+
if (state.lastNonRedTimestamp > 0) {
|
|
117
|
+
const secondsSinceHealthy = (Date.now() / 1000) - state.lastNonRedTimestamp;
|
|
118
|
+
|
|
119
|
+
const newest = eligible
|
|
120
|
+
.filter(proc =>
|
|
121
|
+
proc.age_seconds !== null &&
|
|
122
|
+
proc.age_seconds <= secondsSinceHealthy
|
|
123
|
+
)
|
|
124
|
+
.sort((a, b) => b.footprint_mb - a.footprint_mb);
|
|
125
|
+
|
|
126
|
+
if (newest.length > 0) {
|
|
127
|
+
// Only kill the single largest — temporal correlation alone is weak evidence
|
|
128
|
+
const target = newest[0];
|
|
129
|
+
return {
|
|
130
|
+
targets: [target],
|
|
131
|
+
reason: `Killing newest heavy process (appeared after last stable state ${Math.round(secondsSinceHealthy)}s ago): ${target.name}(${target.footprint_mb}MB, age ${target.age_seconds}s)`,
|
|
132
|
+
};
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// No evidence — don't kill. Caller should block commands and wait.
|
|
137
|
+
return null;
|
|
138
|
+
}
|