pi-deadman 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,172 @@
1
+ // signals.ts — macOS kernel memory metrics, read via sysctl and vm_stat
2
+
3
+ import { execSync } from "child_process";
4
+
5
// Baseline kept between collectSignals() calls so that cumulative vm_stat
// counters can be converted into per-second rates. Both values are null
// until a sample has been stored.
let previousVmStat: Record<string, number> | null = null;
// Time the baseline was taken, in seconds since the epoch.
let previousTimestamp: number | null = null;
8
+
9
/**
 * One sample of macOS memory-health signals, as returned by collectSignals().
 */
export interface SystemSignals {
  /** Swapouts per second since the previous call; 0 on the first call. */
  swapout_rate: number;
  /** Swapins per second since the previous call. */
  swapin_rate: number;
  /** Decompressions per second since the previous call. */
  decomp_rate: number;
  /**
   * Value of `sysctl kern.memorystatus_vm_pressure_level`: 1, 2 or 4.
   * Any other reading (or a failed read) is reported as 1.
   */
  pressure_level: number;
  /** `sysctl kern.memorystatus_level`, clamped to 0-100; 0 if unreadable. */
  memorystatus_level: number;
  /** "used" figure parsed from `sysctl vm.swapusage`. */
  swap_used_mb: number;
  /** "free" figure parsed from `sysctl vm.swapusage`. */
  swap_free_mb: number;
  /** Pages stored in compressor / pages occupied by compressor, minimum 1.0. */
  compression_ratio: number;
}
19
+
20
/**
 * Convert a cumulative vm_stat counter into a per-second rate.
 *
 * Returns 0 when the counter is missing from either sample or when no time
 * has elapsed, so a failed or partial read can never be mistaken for a burst
 * of activity. Negative deltas are reported as 0.
 */
function counterRatePerSecond(
  previous: Record<string, number>,
  current: Record<string, number>,
  key: string,
  elapsedSeconds: number,
): number {
  const before = previous[key];
  const after = current[key];
  if (before === undefined || after === undefined || !(elapsedSeconds > 0)) {
    return 0;
  }
  return Math.max(0, (after - before) / elapsedSeconds);
}

/**
 * Collect one sample of memory signals.
 *
 * Rate fields are deltas against the baseline stored by the previous
 * successful call and are 0 on the first call. The remaining fields are
 * point-in-time readings. The function is declared async for its callers
 * but performs no awaiting itself.
 *
 * @returns the current signal sample; every field has a safe default when
 *          the underlying command cannot be read.
 */
export async function collectSignals(): Promise<SystemSignals> {
  const timestamp = Date.now() / 1000; // seconds since the epoch
  const currentVmStat = getVmStat();

  let swapout_rate = 0;
  let swapin_rate = 0;
  let decomp_rate = 0;

  if (previousVmStat !== null && previousTimestamp !== null) {
    const elapsed = timestamp - previousTimestamp;
    swapout_rate = counterRatePerSecond(previousVmStat, currentVmStat, "swapouts", elapsed);
    swapin_rate = counterRatePerSecond(previousVmStat, currentVmStat, "swapins", elapsed);
    decomp_rate = counterRatePerSecond(previousVmStat, currentVmStat, "decompressions", elapsed);
  }

  // getVmStat() returns {} when vm_stat fails. Storing that as the baseline
  // would make the next good sample look like the entire cumulative counter
  // happened within one interval, so keep the last good baseline instead.
  if (Object.keys(currentVmStat).length > 0) {
    previousVmStat = currentVmStat;
    previousTimestamp = timestamp;
  }

  // Point-in-time signals
  const pressure_level = getPressureLevel();
  const memorystatus_level = getMemorystatusLevel();
  const [swap_used_mb, swap_free_mb] = getSwapUsage();
  const compression_ratio = getCompressionRatio(currentVmStat);

  return {
    swapout_rate,
    swapin_rate,
    decomp_rate,
    pressure_level,
    memorystatus_level,
    swap_used_mb,
    swap_free_mb,
    compression_ratio,
  };
}
78
+
79
/**
 * Run `vm_stat` and return its "Label: value." rows as a map from the
 * lower-cased label to the integer value. Rows without a colon or without a
 * numeric value are ignored. Returns an empty object if the command fails.
 */
function getVmStat(): Record<string, number> {
  try {
    const counters: Record<string, number> = {};
    const rows = execSync("vm_stat", { encoding: "utf8", timeout: 5000 })
      .trim()
      .split("\n");

    rows.forEach((row) => {
      const fields = row.split(":");
      if (fields.length < 2) {
        return; // no colon on this row
      }
      // Values are printed with a trailing period, e.g. "12345."
      const parsed = parseInt(fields[1].trim().replace(/\.$/, ""), 10);
      if (Number.isNaN(parsed)) {
        return;
      }
      counters[fields[0].trim().toLowerCase()] = parsed;
    });

    return counters;
  } catch {
    return {};
  }
}
100
+
101
/**
 * Read `sysctl kern.memorystatus_vm_pressure_level`.
 *
 * @returns 1, 2 or 4 as reported by the kernel; 1 for any other reading or
 *          when the command fails.
 */
function getPressureLevel(): number {
  const fallback = 1;
  try {
    const raw = execSync("sysctl -n kern.memorystatus_vm_pressure_level", {
      encoding: "utf8",
      timeout: 5000
    });
    const parsed = parseInt(raw.trim(), 10);
    const recognised = parsed === 1 || parsed === 2 || parsed === 4;
    return recognised ? parsed : fallback;
  } catch {
    return fallback;
  }
}
116
+
117
/**
 * Read `sysctl kern.memorystatus_level` and clamp it to the range 0-100.
 *
 * @returns the clamped level, or 0 when the value is not numeric or the
 *          command fails.
 */
function getMemorystatusLevel(): number {
  try {
    const raw = execSync("sysctl -n kern.memorystatus_level", {
      encoding: "utf8",
      timeout: 5000
    });
    const parsed = parseInt(raw.trim(), 10);
    if (Number.isNaN(parsed)) {
      return 0;
    }
    return Math.min(100, Math.max(0, parsed));
  } catch {
    return 0;
  }
}
129
+
130
/**
 * Read `sysctl vm.swapusage` and extract the "used = X.XXM" and
 * "free = Y.YYM" figures.
 *
 * @returns `[usedMb, freeMb]`; a figure that is absent or not numeric is 0,
 *          and `[0, 0]` is returned when the command fails.
 */
function getSwapUsage(): [number, number] {
  try {
    const tokens = execSync("sysctl -n vm.swapusage", {
      encoding: "utf8",
      timeout: 5000
    })
      .trim()
      .split(/\s+/);

    let usedMb = 0;
    let freeMb = 0;

    // A reading is three consecutive tokens: label, "=", value.
    const valueAfter = (labelIndex: number): number =>
      parseFloat(tokens[labelIndex + 2].replace("M", ""));

    // Only positions that still have two tokens after them can be labels.
    const labelCandidates = tokens.slice(0, Math.max(0, tokens.length - 2));
    labelCandidates.forEach((label, index) => {
      if (tokens[index + 1] !== "=") {
        return;
      }
      if (label === "used") {
        const megabytes = valueAfter(index);
        if (!Number.isNaN(megabytes)) usedMb = megabytes;
      } else if (label === "free") {
        const megabytes = valueAfter(index);
        if (!Number.isNaN(megabytes)) freeMb = megabytes;
      }
    });

    return [usedMb, freeMb];
  } catch {
    return [0, 0];
  }
}
157
+
158
/**
 * Compute the compressor ratio from parsed vm_stat counters:
 * "pages stored in compressor" divided by "pages occupied by compressor".
 *
 * @param vmData - counters keyed by lower-cased vm_stat label
 * @returns the ratio, never below 1.0; 1.0 when either counter is missing,
 *          zero, or the lookup throws.
 */
function getCompressionRatio(vmData: Record<string, number>): number {
  const neutral = 1.0;
  try {
    const occupied = vmData["pages occupied by compressor"] || 0;
    const stored = vmData["pages stored in compressor"] || 0;

    if (!(occupied > 0 && stored > 0)) {
      return neutral;
    }
    return Math.max(neutral, stored / occupied);
  } catch {
    return neutral;
  }
}
@@ -0,0 +1,218 @@
1
/** One process as captured in a snapshot. */
export interface SnapshotProcess {
  /** Process ID; used as the identity key when comparing snapshots. */
  pid: number;
  /** Process name; processes are grouped by this value. */
  name: string;
  /** Memory footprint in megabytes. */
  footprint_mb: number;
  /** Seconds the process has been running, or null when unknown. */
  age_seconds: number | null;
}
7
+
8
/** A process flagged by findGrowingProcesses(), with its observed growth. */
export interface GrowingProcess extends SnapshotProcess {
  /** Newest minus oldest observed footprint, in megabytes. */
  delta_mb: number;
}
11
+
12
/**
 * Render an age in seconds as a short human-readable string.
 *
 * @param seconds - age in seconds, or null when unknown
 * @returns "unknown" for null, "Ns" below a minute, "Mm Ss" below an hour,
 *          otherwise "Hh Mm".
 */
export function formatAge(seconds: number | null): string {
  const perMinute = 60;
  const perHour = 3600;

  if (seconds === null) return "unknown";
  if (seconds < perMinute) return `${seconds}s`;
  if (seconds < perHour) {
    return `${Math.floor(seconds / perMinute)}m ${seconds % perMinute}s`;
  }

  // One hour or more: seconds are dropped from the output.
  const wholeHours = Math.floor(seconds / perHour);
  const leftoverMinutes = Math.floor((seconds % perHour) / perMinute);
  return `${wholeHours}h ${leftoverMinutes}m`;
}
33
+
34
/**
 * Parse a `ps` etime value into seconds.
 *
 * Accepted formats: SS, MM:SS, HH:MM:SS, each optionally prefixed with
 * "DD-". Surrounding whitespace is ignored. Every field must consist of
 * decimal digits only, so malformed input such as "12abc", "1.5" or "5:-3"
 * is rejected instead of being partially parsed.
 *
 * @param etime - raw etime text
 * @returns total seconds, or null on parse failure
 */
export function parseEtime(etime: string): number | null {
  const trimmed = etime.trim();
  if (!trimmed) return null;

  const isDigits = (field: string): boolean => /^\d+$/.test(field);

  let days = 0;
  let clock = trimmed;

  // Optional "DD-" prefix: everything before the first dash is the day count.
  const dashAt = trimmed.indexOf("-");
  if (dashAt !== -1) {
    const dayField = trimmed.slice(0, dashAt);
    if (!isDigits(dayField)) return null;
    days = Number(dayField);
    clock = trimmed.slice(dashAt + 1);
  }

  const fields = clock.split(":");
  if (fields.length > 3 || !fields.every((field) => isDigits(field))) {
    return null;
  }

  // Read from the right so that missing leading fields default to 0.
  const [seconds = 0, minutes = 0, hours = 0] = fields.map((field) => Number(field)).reverse();
  return days * 86400 + hours * 3600 + minutes * 60 + seconds;
}
73
+
74
/**
 * Build a one-line description of a process:
 * "name [PID n] (X MB, running <age>)", with the age rendered by formatAge().
 */
export function describeProcess(proc: SnapshotProcess): string {
  const runningFor = formatAge(proc.age_seconds);
  const { name, pid, footprint_mb } = proc;
  return `${name} [PID ${pid}] (${footprint_mb} MB, running ${runningFor})`;
}
79
+
80
/**
 * Return the processes in `current` whose PID does not appear in `previous`.
 * The order of `current` is preserved.
 */
export function findNewProcesses(
  previous: SnapshotProcess[],
  current: SnapshotProcess[],
): SnapshotProcess[] {
  const knownPids = new Set<number>();
  for (const { pid } of previous) {
    knownPids.add(pid);
  }

  const fresh: SnapshotProcess[] = [];
  for (const candidate of current) {
    if (!knownPids.has(candidate.pid)) {
      fresh.push(candidate);
    }
  }
  return fresh;
}
88
+
89
/**
 * Find processes with sustained growth across a snapshot history.
 *
 * A process qualifies when all of the following hold:
 *  - the history contains at least 3 snapshots and the PID appears in at
 *    least 3 of them (snapshots where it is missing are skipped);
 *  - its footprint increased in at least `minGrowthIntervals` transitions
 *    between consecutive observations. With the default of 3 this means the
 *    PID must be observed in at least 4 snapshots;
 *  - newest minus oldest observed footprint is >= `thresholdMb`.
 *
 * Counting increases rather than requiring strict monotonic growth makes
 * this resilient to momentary plateaus.
 *
 * @param snapshotHistory - snapshots, oldest first
 * @param currentChildren - current process list; only these PIDs are
 *        considered, and the returned entries are copies of these objects
 * @param thresholdMb - minimum total delta from oldest to newest (default 200)
 * @param minGrowthIntervals - minimum number of transitions showing growth (default 3)
 * @returns qualifying processes with `delta_mb` added, largest delta first
 */
export function findGrowingProcesses(
  snapshotHistory: SnapshotProcess[][],
  currentChildren: SnapshotProcess[],
  thresholdMb: number = 200,
  minGrowthIntervals: number = 3,
): GrowingProcess[] {
  if (snapshotHistory.length < 3) {
    return [];
  }

  // Index every snapshot by PID once so each lookup below is O(1).
  // The first entry for a PID wins, matching a front-to-back search.
  const footprintIndexes = snapshotHistory.map((snapshot) => {
    const byPid = new Map<number, number>();
    for (const proc of snapshot) {
      if (!byPid.has(proc.pid)) {
        byPid.set(proc.pid, proc.footprint_mb);
      }
    }
    return byPid;
  });

  const seenPids = new Set<number>();
  const growing: GrowingProcess[] = [];

  for (const proc of currentChildren) {
    // Each PID is evaluated once, using its first entry in currentChildren.
    if (seenPids.has(proc.pid)) continue;
    seenPids.add(proc.pid);

    // Footprints for this PID, oldest first, skipping snapshots without it.
    const footprints: number[] = [];
    for (const byPid of footprintIndexes) {
      const footprint = byPid.get(proc.pid);
      if (footprint !== undefined) {
        footprints.push(footprint);
      }
    }

    if (footprints.length < 3) continue;

    let growthIntervals = 0;
    for (let i = 1; i < footprints.length; i++) {
      if (footprints[i] > footprints[i - 1]) {
        growthIntervals++;
      }
    }
    if (growthIntervals < minGrowthIntervals) continue;

    const totalDelta = footprints[footprints.length - 1] - footprints[0];
    if (totalDelta < thresholdMb) continue;

    growing.push({ ...proc, delta_mb: totalDelta });
  }

  return growing.sort((a, b) => b.delta_mb - a.delta_mb);
}
156
+
157
/**
 * Find the largest group of processes sharing the same name.
 *
 * @returns the group with the most members, provided it has at least 2;
 *          on a tie the name encountered first wins. Returns an empty array
 *          when no name occurs twice.
 */
export function findSimilarGroup(processes: SnapshotProcess[]): SnapshotProcess[] {
  const byName = new Map<string, SnapshotProcess[]>();
  processes.forEach((proc) => {
    const bucket = byName.get(proc.name);
    if (bucket) {
      bucket.push(proc);
    } else {
      byName.set(proc.name, [proc]);
    }
  });

  return [...byName.values()].reduce<SnapshotProcess[]>(
    (winner, bucket) =>
      bucket.length >= 2 && bucket.length > winner.length ? bucket : winner,
    [],
  );
}
181
+
182
/**
 * Find the heaviest swarm of same-name processes.
 *
 * A swarm is a set of at least `minCount` processes with the same name whose
 * combined footprint is at least `minCombinedMb`. Among qualifying swarms the
 * one with the highest combined footprint is chosen.
 *
 * Example: 7 vitest workers at 500 MB each form a 3.5 GB swarm.
 *
 * @returns the chosen swarm sorted by individual footprint, largest first,
 *          or an empty array when nothing qualifies.
 */
export function findSwarm(
  processes: SnapshotProcess[],
  minCount: number = 3,
  minCombinedMb: number = 500,
): SnapshotProcess[] {
  const byName = new Map<string, SnapshotProcess[]>();
  for (const proc of processes) {
    const members = byName.get(proc.name);
    if (members) {
      members.push(proc);
    } else {
      byName.set(proc.name, [proc]);
    }
  }

  let heaviest: SnapshotProcess[] = [];
  let heaviestTotal = 0;

  for (const members of byName.values()) {
    if (members.length < minCount) continue;

    let total = 0;
    for (const member of members) {
      total += member.footprint_mb;
    }

    const qualifies = total >= minCombinedMb && total > heaviestTotal;
    if (!qualifies) continue;

    heaviest = members;
    heaviestTotal = total;
  }

  return heaviest.sort((a, b) => b.footprint_mb - a.footprint_mb);
}
@@ -0,0 +1,138 @@
1
+ import type { SnapshotProcess } from "./tree";
2
+ import { findGrowingProcesses, findSwarm } from "./tree";
3
+
4
/** Mutable bookkeeping the watchdog carries between polls. */
export interface WatchdogState {
  /**
   * Ring buffer of recent snapshots, oldest first (max 10). Each entry is
   * pi's child process list.
   */
  snapshotHistory: SnapshotProcess[][];
  /** Epoch seconds of the last snapshot taken while the system was stably non-RED. */
  lastNonRedTimestamp: number;
  /** Number of consecutive non-RED polls; provides hysteresis for lastNonRedTimestamp. */
  consecutiveNonRedPolls: number;
  /** Date.now() value of the last kill, or 0 if there has been none. */
  lastKillTime: number;
  /** Minimum number of milliseconds between kills. */
  cooldownMs: number;
}
14
+
15
/** Outcome of selectKillTarget() when something should be killed. */
export interface KillDecision {
  /** Processes to kill. */
  targets: SnapshotProcess[];
  /** Human-readable explanation of why these targets were chosen. */
  reason: string;
}
19
+
20
/**
 * Decide whether automatic killing is allowed.
 *
 * @param zone - current zone name
 * @param confirmed - whether the zone reading has been confirmed
 * @returns true only for a confirmed "RED" zone
 */
export function shouldAutoKill(zone: string, confirmed: boolean): boolean {
  if (zone !== "RED") {
    return false;
  }
  return confirmed === true;
}
24
+
25
/**
 * Footprint floor, in MB: processes below it are never kill candidates.
 * Filters out infrastructure noise.
 */
export const MIN_FOOTPRINT_MB = 50;

/** Growth detection: minimum total footprint delta, in MB. */
export const GROWTH_THRESHOLD_MB = 100;

/** Swarm detection: minimum number of same-name processes. */
export const SWARM_MIN_COUNT = 3;
/** Swarm detection: minimum combined footprint, in MB. */
export const SWARM_MIN_COMBINED_MB = 500;

/** "Heavy & young" detection: maximum age in seconds (10 minutes). */
export const HEAVY_YOUNG_MAX_AGE_SECONDS = 10 * 60;
/** "Heavy & young" detection: minimum footprint in MB (previously 1 GB). */
export const HEAVY_YOUNG_MIN_FOOTPRINT_MB = 200;
38
+
39
/**
 * Choose which processes, if any, to kill.
 *
 * Candidates are first narrowed to processes with footprint of at least
 * MIN_FOOTPRINT_MB, whose age is unknown or at most `maxAgeSeconds`, and
 * whose PID is not `protectedPid`. The first rule below that matches wins:
 *
 *  1. Growing — sustained growth of at least GROWTH_THRESHOLD_MB across the
 *     snapshot history (needs 3+ snapshots) → all growing processes.
 *  2. Swarm — at least SWARM_MIN_COUNT same-name processes with a combined
 *     footprint of at least SWARM_MIN_COMBINED_MB → the whole swarm.
 *  3. Heavy & young — age at most HEAVY_YOUNG_MAX_AGE_SECONDS and footprint
 *     at least HEAVY_YOUNG_MIN_FOOTPRINT_MB → all of them, largest first.
 *  4. Newest — started since the last stable non-RED state → only the single
 *     largest such process, because timing alone is weak evidence.
 *
 * @param currentChildren - current process list
 * @param state - watchdog bookkeeping (history, cooldown, last healthy time)
 * @param maxAgeSeconds - processes known to be older than this are exempt
 * @param protectedPid - optional PID that must never be selected
 * @returns the decision, or null when in cooldown, when no candidate is
 *          eligible, or when no rule matches (the caller should block
 *          commands and wait).
 */
export function selectKillTarget(
  currentChildren: SnapshotProcess[],
  state: WatchdogState,
  maxAgeSeconds: number,
  protectedPid?: number,
): KillDecision | null {
  // Respect the cooldown after a previous kill.
  const inCooldown =
    state.lastKillTime > 0 && Date.now() - state.lastKillTime < state.cooldownMs;
  if (inCooldown) {
    return null;
  }

  const heaviestFirst = (a: SnapshotProcess, b: SnapshotProcess): number =>
    b.footprint_mb - a.footprint_mb;

  // Footprint floor, age ceiling and protected PID in a single pass.
  const eligible = currentChildren.filter((proc) => {
    const bigEnough = proc.footprint_mb >= MIN_FOOTPRINT_MB;
    const youngEnough = proc.age_seconds === null || proc.age_seconds <= maxAgeSeconds;
    const isProtected = protectedPid !== undefined && proc.pid === protectedPid;
    return bigEnough && youngEnough && !isProtected;
  });
  if (eligible.length === 0) {
    return null;
  }

  // Rule 1: growing processes — kill all of them.
  const growing =
    state.snapshotHistory.length >= 3
      ? findGrowingProcesses(state.snapshotHistory, eligible, GROWTH_THRESHOLD_MB)
      : [];
  if (growing.length > 0) {
    const details = growing.map((p) => `${p.name}(+${p.delta_mb}MB)`).join(", ");
    return {
      targets: growing,
      reason: `Killing ${growing.length} growing process(es): ${details}`,
    };
  }

  // Rule 2: heaviest same-name swarm — kill every member.
  const swarm = findSwarm(eligible, SWARM_MIN_COUNT, SWARM_MIN_COMBINED_MB);
  if (swarm.length > 0) {
    let combined = 0;
    for (const member of swarm) {
      combined += member.footprint_mb;
    }
    const details = swarm.map((p) => `${p.name}(${p.footprint_mb}MB)`).join(", ");
    return {
      targets: swarm,
      reason: `Killing swarm of ${swarm.length} ${swarm[0].name} processes (combined ${combined}MB): ${details}`,
    };
  }

  // Rule 3: heavy and recently started — kill all, largest first.
  const heavyYoung = eligible
    .filter(
      ({ age_seconds, footprint_mb }) =>
        age_seconds !== null &&
        age_seconds <= HEAVY_YOUNG_MAX_AGE_SECONDS &&
        footprint_mb >= HEAVY_YOUNG_MIN_FOOTPRINT_MB,
    )
    .sort(heaviestFirst);
  if (heavyYoung.length > 0) {
    const details = heavyYoung
      .map((p) => `${p.name}(${p.footprint_mb}MB, age ${p.age_seconds}s)`)
      .join(", ");
    return {
      targets: heavyYoung,
      reason: `Killing ${heavyYoung.length} heavy & young process(es): ${details}`,
    };
  }

  // Rule 4: started since the system was last stably non-RED — single
  // largest process only, since temporal correlation is the weakest signal.
  if (state.lastNonRedTimestamp > 0) {
    const secondsSinceHealthy = (Date.now() / 1000) - state.lastNonRedTimestamp;

    const [target] = eligible
      .filter(({ age_seconds }) => age_seconds !== null && age_seconds <= secondsSinceHealthy)
      .sort(heaviestFirst);

    if (target !== undefined) {
      return {
        targets: [target],
        reason: `Killing newest heavy process (appeared after last stable state ${Math.round(secondsSinceHealthy)}s ago): ${target.name}(${target.footprint_mb}MB, age ${target.age_seconds}s)`,
      };
    }
  }

  // No evidence — do not kill.
  return null;
}