@appkit/llamacpp-cli 1.4.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/CHANGELOG.md +21 -0
  2. package/MONITORING-ACCURACY-FIX.md +199 -0
  3. package/PER-PROCESS-METRICS.md +190 -0
  4. package/README.md +136 -1
  5. package/dist/cli.js +21 -4
  6. package/dist/cli.js.map +1 -1
  7. package/dist/commands/create.d.ts.map +1 -1
  8. package/dist/commands/create.js +12 -3
  9. package/dist/commands/create.js.map +1 -1
  10. package/dist/commands/monitor.d.ts +2 -0
  11. package/dist/commands/monitor.d.ts.map +1 -0
  12. package/dist/commands/monitor.js +126 -0
  13. package/dist/commands/monitor.js.map +1 -0
  14. package/dist/commands/ps.d.ts +3 -1
  15. package/dist/commands/ps.d.ts.map +1 -1
  16. package/dist/commands/ps.js +75 -5
  17. package/dist/commands/ps.js.map +1 -1
  18. package/dist/commands/server-show.d.ts.map +1 -1
  19. package/dist/commands/server-show.js +10 -3
  20. package/dist/commands/server-show.js.map +1 -1
  21. package/dist/commands/start.d.ts.map +1 -1
  22. package/dist/commands/start.js +14 -2
  23. package/dist/commands/start.js.map +1 -1
  24. package/dist/lib/history-manager.d.ts +46 -0
  25. package/dist/lib/history-manager.d.ts.map +1 -0
  26. package/dist/lib/history-manager.js +157 -0
  27. package/dist/lib/history-manager.js.map +1 -0
  28. package/dist/lib/metrics-aggregator.d.ts +40 -0
  29. package/dist/lib/metrics-aggregator.d.ts.map +1 -0
  30. package/dist/lib/metrics-aggregator.js +211 -0
  31. package/dist/lib/metrics-aggregator.js.map +1 -0
  32. package/dist/lib/system-collector.d.ts +80 -0
  33. package/dist/lib/system-collector.d.ts.map +1 -0
  34. package/dist/lib/system-collector.js +311 -0
  35. package/dist/lib/system-collector.js.map +1 -0
  36. package/dist/tui/HistoricalMonitorApp.d.ts +5 -0
  37. package/dist/tui/HistoricalMonitorApp.d.ts.map +1 -0
  38. package/dist/tui/HistoricalMonitorApp.js +490 -0
  39. package/dist/tui/HistoricalMonitorApp.js.map +1 -0
  40. package/dist/tui/MonitorApp.d.ts +4 -0
  41. package/dist/tui/MonitorApp.d.ts.map +1 -0
  42. package/dist/tui/MonitorApp.js +315 -0
  43. package/dist/tui/MonitorApp.js.map +1 -0
  44. package/dist/tui/MultiServerMonitorApp.d.ts +4 -0
  45. package/dist/tui/MultiServerMonitorApp.d.ts.map +1 -0
  46. package/dist/tui/MultiServerMonitorApp.js +712 -0
  47. package/dist/tui/MultiServerMonitorApp.js.map +1 -0
  48. package/dist/types/history-types.d.ts +30 -0
  49. package/dist/types/history-types.d.ts.map +1 -0
  50. package/dist/types/history-types.js +11 -0
  51. package/dist/types/history-types.js.map +1 -0
  52. package/dist/types/monitor-types.d.ts +123 -0
  53. package/dist/types/monitor-types.d.ts.map +1 -0
  54. package/dist/types/monitor-types.js +3 -0
  55. package/dist/types/monitor-types.js.map +1 -0
  56. package/dist/types/server-config.d.ts +1 -0
  57. package/dist/types/server-config.d.ts.map +1 -1
  58. package/dist/types/server-config.js.map +1 -1
  59. package/dist/utils/downsample-utils.d.ts +35 -0
  60. package/dist/utils/downsample-utils.d.ts.map +1 -0
  61. package/dist/utils/downsample-utils.js +107 -0
  62. package/dist/utils/downsample-utils.js.map +1 -0
  63. package/dist/utils/file-utils.d.ts +6 -0
  64. package/dist/utils/file-utils.d.ts.map +1 -1
  65. package/dist/utils/file-utils.js +38 -0
  66. package/dist/utils/file-utils.js.map +1 -1
  67. package/dist/utils/process-utils.d.ts +35 -2
  68. package/dist/utils/process-utils.d.ts.map +1 -1
  69. package/dist/utils/process-utils.js +220 -25
  70. package/dist/utils/process-utils.js.map +1 -1
  71. package/docs/images/.gitkeep +1 -0
  72. package/package.json +5 -1
  73. package/src/cli.ts +21 -4
  74. package/src/commands/create.ts +14 -4
  75. package/src/commands/monitor.ts +110 -0
  76. package/src/commands/ps.ts +88 -5
  77. package/src/commands/server-show.ts +10 -3
  78. package/src/commands/start.ts +15 -2
  79. package/src/lib/history-manager.ts +172 -0
  80. package/src/lib/metrics-aggregator.ts +257 -0
  81. package/src/lib/system-collector.ts +315 -0
  82. package/src/tui/HistoricalMonitorApp.ts +548 -0
  83. package/src/tui/MonitorApp.ts +386 -0
  84. package/src/tui/MultiServerMonitorApp.ts +792 -0
  85. package/src/types/history-types.ts +39 -0
  86. package/src/types/monitor-types.ts +162 -0
  87. package/src/types/server-config.ts +1 -0
  88. package/src/utils/downsample-utils.ts +128 -0
  89. package/src/utils/file-utils.ts +40 -0
  90. package/src/utils/process-utils.ts +243 -25
  91. package/test-load.sh +100 -0
package/src/types/history-types.ts
@@ -0,0 +1,39 @@
+// Historical monitoring data types
+
+export interface HistorySnapshot {
+  timestamp: number; // Unix timestamp in milliseconds
+  server: {
+    healthy: boolean;
+    uptime?: string;
+    activeSlots: number;
+    idleSlots: number;
+    totalSlots: number;
+    avgPromptSpeed?: number; // Tokens per second
+    avgGenerateSpeed?: number; // Tokens per second
+    processMemory?: number; // Bytes (RSS)
+    processCpuUsage?: number; // Percentage (0-100+) from ps
+  };
+  system?: {
+    gpuUsage?: number; // Percentage (0-100)
+    cpuUsage?: number; // Percentage (0-100)
+    aneUsage?: number; // Percentage (0-100)
+    temperature?: number; // Celsius
+    memoryUsed: number; // Bytes
+    memoryTotal: number; // Bytes
+  };
+}
+
+export interface HistoryData {
+  serverId: string;
+  snapshots: HistorySnapshot[];
+}
+
+export type TimeWindow = '1h' | '6h' | '24h';
+
+export const TIME_WINDOW_HOURS: Record<TimeWindow, number> = {
+  '1h': 1,
+  '6h': 6,
+  '24h': 24,
+};
+
+export const TIME_WINDOWS: TimeWindow[] = ['1h', '6h', '24h'];
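The TIME_WINDOW_HOURS map makes window-based pruning a one-liner. A minimal sketch of how a history store could drop snapshots outside the selected window (the pruneSnapshots helper is hypothetical, not part of this diff):

```ts
import { HistoryData, TimeWindow, TIME_WINDOW_HOURS } from './history-types.js';

// Hypothetical helper: keep only snapshots inside the selected time window.
function pruneSnapshots(history: HistoryData, window: TimeWindow): HistoryData {
  const cutoffMs = Date.now() - TIME_WINDOW_HOURS[window] * 60 * 60 * 1000;
  return {
    serverId: history.serverId,
    snapshots: history.snapshots.filter((s) => s.timestamp >= cutoffMs),
  };
}
```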
package/src/types/monitor-types.ts
@@ -0,0 +1,162 @@
+import { ServerConfig } from './server-config.js';
+
+// llama.cpp API response types
+
+export interface HealthResponse {
+  status: string;
+  error?: string;
+}
+
+export interface PropsResponse {
+  default_generation_settings: {
+    n_ctx: number;
+    n_predict: number;
+    model: string;
+    seed: number;
+    temperature: number;
+    top_k: number;
+    top_p: number;
+    min_p: number;
+    n_keep: number;
+    stream: boolean;
+  };
+  total_slots: number;
+  model_loaded: boolean;
+  model_path: string;
+  model_alias?: string;
+}
+
+export interface SlotInfo {
+  id: number;
+  state: 'idle' | 'processing';
+  task_id?: number;
+  prompt?: string;
+  n_prompt_tokens?: number;
+  n_decoded?: number;
+  n_ctx: number;
+  truncated?: boolean;
+  stopped_eos?: boolean;
+  stopped_word?: boolean;
+  stopped_limit?: boolean;
+  stopping_word?: string;
+  tokens_predicted?: number;
+  tokens_evaluated?: number;
+  generation_settings?: {
+    n_ctx: number;
+    n_predict: number;
+    seed: number;
+    temperature: number;
+    top_k: number;
+    top_p: number;
+  };
+  prompt_tokens_processed?: number;
+  t_prompt_processing?: number; // Time in ms
+  t_token_generation?: number; // Time in ms
+  timings?: {
+    prompt_n: number;
+    prompt_ms: number;
+    prompt_per_token_ms: number;
+    prompt_per_second: number;
+    predicted_n: number;
+    predicted_ms: number;
+    predicted_per_token_ms: number;
+    predicted_per_second: number;
+  };
+}
+
+export interface SlotsResponse {
+  slots: SlotInfo[];
+}
+
+// System metrics types
+
+export interface SystemMetrics {
+  // GPU/CPU/ANE (from macmon if available)
+  gpuUsage?: number; // Percentage (0-100)
+  cpuUsage?: number; // Percentage (0-100)
+  cpuCores?: number; // Number of cores
+  aneUsage?: number; // Apple Neural Engine percentage (0-100)
+  temperature?: number; // GPU temperature in Celsius
+
+  // Memory (from vm_stat or macmon)
+  memoryUsed: number; // Bytes
+  memoryTotal: number; // Bytes
+  swapUsed?: number; // Bytes
+  processMemory?: number; // Bytes (specific to llama-server process)
+
+  // Metadata
+  timestamp: number;
+  source: 'macmon' | 'vm_stat' | 'none';
+  warnings?: string[]; // e.g., "macmon not available, showing memory only"
+}
+
+// Aggregated metrics for TUI display
+
+export interface ServerMetrics {
+  // Server identification
+  server: ServerConfig;
+
+  // Health status
+  healthy: boolean;
+  uptime?: string; // Human-readable (e.g., "2h 34m 12s")
+  error?: string;
+
+  // Model information
+  modelLoaded: boolean;
+  modelName: string;
+  contextSize: number;
+  totalSlots: number;
+
+  // Request metrics
+  activeSlots: number;
+  idleSlots: number;
+  slots: SlotInfo[];
+
+  // Performance metrics (derived from slots)
+  avgPromptSpeed?: number; // Tokens per second
+  avgGenerateSpeed?: number; // Tokens per second
+  requestsPerMinute?: number; // Estimated from slot activity
+  avgLatency?: number; // Milliseconds
+
+  // Cache metrics (if available from /metrics endpoint)
+  cacheHitRate?: number; // Percentage
+
+  // Process metrics
+  processMemory?: number; // Bytes (actual RSS from top command)
+  processCpuUsage?: number; // Percentage (0-100+) from ps command
+
+  // Timestamp
+  timestamp: number;
+  stale: boolean; // True if data is from last successful fetch
+}
+
+export interface MonitorData {
+  server: ServerMetrics;
+  system?: SystemMetrics;
+  lastUpdated: Date;
+  updateInterval: number; // Milliseconds
+  consecutiveFailures: number;
+}
+
+// Error and loading states
+
+export interface ErrorState {
+  error: string;
+  canRetry: boolean;
+  suggestions?: string[];
+}
+
+export interface LoadingState {
+  message: string;
+  progress?: number; // 0-100 if determinate
+}
+
+// Collection result (for graceful degradation)
+
+export interface CollectionResult<T> {
+  success: boolean;
+  data: T | null;
+  error?: string;
+  warnings?: string[];
+  stale?: boolean;
+}
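CollectionResult<T> is the vehicle for the "graceful degradation" noted above: a failed poll can still hand the TUI the last good payload, flagged stale. A sketch of one way a collector might use it (the fetchSlots parameter is an assumption, not shown in this diff):

```ts
import { CollectionResult, SlotsResponse } from './monitor-types.js';

let lastGood: SlotsResponse | null = null;

// Hypothetical collector: fall back to the last successful fetch on error.
async function collectSlots(
  fetchSlots: () => Promise<SlotsResponse>
): Promise<CollectionResult<SlotsResponse>> {
  try {
    lastGood = await fetchSlots();
    return { success: true, data: lastGood };
  } catch (err) {
    return {
      success: false,
      data: lastGood, // null if nothing has ever succeeded
      error: err instanceof Error ? err.message : String(err),
      stale: lastGood !== null,
    };
  }
}
```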
package/src/types/server-config.ts
@@ -22,6 +22,7 @@ export interface ServerConfig {
   createdAt: string; // ISO timestamp
   lastStarted?: string; // ISO timestamp
   lastStopped?: string; // ISO timestamp
+  metalMemoryMB?: number; // Metal (GPU) memory allocated in MB (parsed from logs)
 
   // launchctl metadata
   plistPath: string; // Full path to plist file
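Because `top` reports CPU-side memory only (see the process-utils notes below), a display layer can combine this field with the process RSS for a fuller footprint estimate. A hypothetical sketch, not taken from the package:

```ts
import { ServerConfig } from './server-config.js';

// Hypothetical helper: approximate total footprint as CPU RSS + Metal allocation.
function approxTotalMemoryBytes(config: ServerConfig, rssBytes: number): number {
  const metalBytes = (config.metalMemoryMB ?? 0) * 1024 * 1024;
  return rssBytes + metalBytes;
}
```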
package/src/utils/downsample-utils.ts
@@ -0,0 +1,128 @@
+/**
+ * Downsampling utilities for time-series chart data
+ * Uses time-aligned buckets to ensure stable charts as new data arrives
+ */
+
+export interface TimeSeriesPoint {
+  timestamp: number;
+  value: number;
+}
+
+type AggregationMethod = 'max' | 'mean';
+
+const ONE_HOUR_MS = 60 * 60 * 1000;
+
+/**
+ * Core bucketing logic shared by all downsampling functions.
+ * Uses ABSOLUTE bucket boundaries that never shift, ensuring chart stability.
+ */
+function createTimeBuckets(
+  data: TimeSeriesPoint[],
+  targetPoints: number,
+  startTime: number,
+  endTime: number
+): number[][] {
+  const timeRange = endTime - startTime;
+  const bucketDuration = Math.ceil(timeRange / targetPoints);
+  const alignedStart = Math.floor(startTime / bucketDuration) * bucketDuration;
+  const buckets: number[][] = Array.from({ length: targetPoints }, () => []);
+
+  for (const point of data) {
+    if (point.timestamp < startTime || point.timestamp > endTime) continue;
+    const bucketIndex = Math.floor((point.timestamp - alignedStart) / bucketDuration);
+    if (bucketIndex >= 0 && bucketIndex < targetPoints) {
+      buckets[bucketIndex].push(point.value);
+    }
+  }
+
+  return buckets;
+}
+
+/**
+ * Aggregate bucket values using the specified method.
+ */
+function aggregateBuckets(buckets: number[][], method: AggregationMethod): number[] {
+  return buckets.map(bucket => {
+    const validValues = method === 'max'
+      ? bucket.filter(v => !isNaN(v) && v > 0)
+      : bucket.filter(v => !isNaN(v) && isFinite(v));
+
+    if (validValues.length === 0) return 0;
+
+    if (method === 'max') {
+      return Math.max(...validValues);
+    }
+    return validValues.reduce((sum, v) => sum + v, 0) / validValues.length;
+  });
+}
+
+/**
+ * Downsample using time-aligned bucket max - preserves peaks.
+ * Best for: GPU/CPU usage, token speeds where peaks matter.
+ */
+export function downsampleMaxTime(data: TimeSeriesPoint[], targetPoints: number): number[] {
+  if (data.length === 0) return [];
+  if (data.length <= targetPoints) return data.map(d => d.value);
+
+  const buckets = createTimeBuckets(
+    data,
+    targetPoints,
+    data[0].timestamp,
+    data[data.length - 1].timestamp
+  );
+  return aggregateBuckets(buckets, 'max');
+}
+
+/**
+ * Downsample using time-aligned bucket mean - preserves average trends.
+ * Best for: Memory usage where average is meaningful.
+ */
+export function downsampleMeanTime(data: TimeSeriesPoint[], targetPoints: number): number[] {
+  if (data.length === 0) return [];
+  if (data.length <= targetPoints) return data.map(d => d.value);
+
+  const buckets = createTimeBuckets(
+    data,
+    targetPoints,
+    data[0].timestamp,
+    data[data.length - 1].timestamp
+  );
+  return aggregateBuckets(buckets, 'mean');
+}
+
+/**
+ * Calculate downsampling ratio as a display string.
+ */
+export function getDownsampleRatio(originalCount: number, targetCount: number): string {
+  if (originalCount <= targetCount) return '1:1';
+  const ratio = Math.round(originalCount / targetCount);
+  return `${ratio}:1`;
+}
+
+/**
+ * Downsample with full hour coverage using max aggregation.
+ * Creates buckets for the entire hour (60 minutes), filling gaps with 0.
+ * Best for: Hour view where we want to show the full time range.
+ */
+export function downsampleMaxTimeWithFullHour(data: TimeSeriesPoint[], targetPoints: number): number[] {
+  if (data.length === 0) return Array(targetPoints).fill(0);
+
+  const now = Date.now();
+  const oneHourAgo = now - ONE_HOUR_MS;
+  const buckets = createTimeBuckets(data, targetPoints, oneHourAgo, now);
+  return aggregateBuckets(buckets, 'max');
+}
+
+/**
+ * Downsample with full hour coverage using mean aggregation.
+ * Creates buckets for the entire hour (60 minutes), filling gaps with 0.
+ * Best for: Hour view where we want to show the full time range.
+ */
+export function downsampleMeanTimeWithFullHour(data: TimeSeriesPoint[], targetPoints: number): number[] {
+  if (data.length === 0) return Array(targetPoints).fill(0);
+
+  const now = Date.now();
+  const oneHourAgo = now - ONE_HOUR_MS;
+  const buckets = createTimeBuckets(data, targetPoints, oneHourAgo, now);
+  return aggregateBuckets(buckets, 'mean');
+}
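As a usage sketch, a 60-column GPU chart in the hour view could be produced by pushing raw snapshots through the max variant so short spikes survive (import paths assumed from the package layout):

```ts
import { HistorySnapshot } from '../types/history-types.js';
import { downsampleMaxTimeWithFullHour, TimeSeriesPoint } from './downsample-utils.js';

// Illustrative only: one value per chart column, peaks preserved, gaps filled with 0.
function gpuChartValues(snapshots: HistorySnapshot[], width: number): number[] {
  const series: TimeSeriesPoint[] = snapshots.map((s) => ({
    timestamp: s.timestamp,
    value: s.system?.gpuUsage ?? 0,
  }));
  return downsampleMaxTimeWithFullHour(series, width);
}
```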
package/src/utils/file-utils.ts
@@ -104,3 +104,43 @@ export function expandHome(filePath: string): string {
   }
   return filePath;
 }
+
+/**
+ * Parse Metal (GPU) memory allocation from llama-server stderr logs
+ * Looks for line: "load_tensors: Metal_Mapped model buffer size = 11120.23 MiB"
+ * Returns memory in MB, or null if not found
+ */
+export async function parseMetalMemoryFromLog(stderrPath: string): Promise<number | null> {
+  try {
+    // Check if log file exists
+    if (!(await fileExists(stderrPath))) {
+      return null;
+    }
+
+    // Open file for reading
+    const fileHandle = await fs.open(stderrPath, 'r');
+
+    try {
+      // Read first 256KB (Metal allocation happens early during model loading)
+      const buffer = Buffer.alloc(256 * 1024);
+      const { bytesRead } = await fileHandle.read(buffer, 0, buffer.length, 0);
+      const content = buffer.toString('utf-8', 0, bytesRead);
+      const lines = content.split('\n');
+
+      // Look for Metal_Mapped buffer size
+      for (const line of lines) {
+        const match = line.match(/Metal_Mapped model buffer size\s*=\s*([\d.]+)\s*MiB/);
+        if (match) {
+          const sizeInMB = parseFloat(match[1]);
+          return isNaN(sizeInMB) ? null : sizeInMB;
+        }
+      }
+
+      return null;
+    } finally {
+      await fileHandle.close();
+    }
+  } catch {
+    return null;
+  }
+}
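This is presumably what feeds the new ServerConfig.metalMemoryMB field: after launch, the CLI can read the stderr log once and persist the parsed value. A sketch under that assumption (the config shape and wiring are stand-ins, not the package's actual code):

```ts
import { parseMetalMemoryFromLog } from './file-utils.js';

// Hypothetical wiring: record parsed Metal memory on the server's config object.
async function recordMetalMemory(config: { stderrPath: string; metalMemoryMB?: number }) {
  const mb = await parseMetalMemoryFromLog(config.stderrPath);
  if (mb !== null) {
    config.metalMemoryMB = mb; // e.g., 11120.23 for an ~11 GiB model buffer
  }
}
```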
package/src/utils/process-utils.ts
@@ -1,4 +1,4 @@
-import { exec } from 'child_process';
+import { exec, spawn } from 'child_process';
 import { promisify } from 'util';
 
 export const execAsync = promisify(exec);
@@ -60,39 +60,257 @@ export async function isPortInUse(port: number): Promise<boolean> {
 }
 
 /**
- * Get memory usage for a process in bytes
- * Uses 'top' on macOS which includes GPU/Metal memory (more accurate for llama-server)
+ * Spawn a streaming command, read one line, and kill it
+ * Useful for commands like 'macmon pipe' that stream indefinitely
+ * Ensures the process is killed to prevent leaks
+ */
+export async function spawnAndReadOneLine(
+  command: string,
+  args: string[],
+  timeoutMs: number = 2000
+): Promise<string | null> {
+  return new Promise((resolve) => {
+    const child = spawn(command, args, {
+      stdio: ['ignore', 'pipe', 'ignore'],
+      detached: false, // Keep in same process group for easier cleanup
+    });
+
+    let resolved = false;
+    let output = '';
+
+    const cleanup = () => {
+      try {
+        // Try SIGKILL immediately (SIGTERM may not work for macmon)
+        child.kill('SIGKILL');
+      } catch {
+        // Process might already be dead
+      }
+    };
+
+    // Set timeout to kill process if it doesn't produce output
+    const timeout = setTimeout(() => {
+      if (!resolved) {
+        resolved = true;
+        cleanup();
+        resolve(null);
+      }
+    }, timeoutMs);
+
+    // Read stdout line by line
+    child.stdout?.on('data', (data) => {
+      if (resolved) return;
+
+      output += data.toString();
+
+      // Check if we have a complete line
+      const newlineIndex = output.indexOf('\n');
+      if (newlineIndex !== -1) {
+        const line = output.substring(0, newlineIndex).trim();
+
+        if (line.length > 0) {
+          resolved = true;
+          clearTimeout(timeout);
+          cleanup();
+          resolve(line);
+        }
+      }
+    });
+
+    // Handle process errors
+    child.on('error', () => {
+      if (!resolved) {
+        resolved = true;
+        clearTimeout(timeout);
+        resolve(null);
+      }
+    });
+
+    // Handle process exit
+    child.on('exit', () => {
+      if (!resolved) {
+        resolved = true;
+        clearTimeout(timeout);
+
+        // Return partial output if we have any
+        const line = output.trim();
+        resolve(line.length > 0 ? line : null);
+      }
+    });
+  });
+}
+
+// Process memory cache to prevent spawning too many 'top' processes
+// Cache per PID with 3-second TTL
+const processMemoryCache = new Map<number, { value: number | null; timestamp: number }>();
+const PROCESS_MEMORY_CACHE_TTL = 3000; // 3 seconds
+
+/**
+ * Batch get memory usage for multiple processes in one top call
+ * Much more efficient than calling getProcessMemory() multiple times
+ * Returns Map<pid, bytes> for all requested PIDs
+ */
+export async function getBatchProcessMemory(pids: number[]): Promise<Map<number, number | null>> {
+  const result = new Map<number, number | null>();
+  const now = Date.now();
+
+  // Check cache and collect PIDs that need fetching
+  const pidsToFetch: number[] = [];
+  for (const pid of pids) {
+    const cached = processMemoryCache.get(pid);
+    if (cached && (now - cached.timestamp) < PROCESS_MEMORY_CACHE_TTL) {
+      result.set(pid, cached.value);
+    } else {
+      pidsToFetch.push(pid);
+    }
+  }
+
+  // If all PIDs were cached, return early
+  if (pidsToFetch.length === 0) {
+    return result;
+  }
+
+  try {
+    // Build top command with all PIDs: top -l 1 -pid X -pid Y -pid Z -stats pid,mem
+    const pidArgs = pidsToFetch.map(pid => `-pid ${pid}`).join(' ');
+    const output = await execCommand(`top -l 1 ${pidArgs} -stats pid,mem 2>/dev/null`);
+
+    // Parse output: each line is "PID MEM" (e.g., "1438 299M")
+    const lines = output.split('\n');
+    for (const line of lines) {
+      const match = line.trim().match(/^(\d+)\s+([\d.]+)([KMGT])\s*$/);
+      if (!match) continue;
+
+      const pid = parseInt(match[1], 10);
+      const value = parseFloat(match[2]);
+      const unit = match[3];
+
+      // Convert to bytes
+      const multipliers: { [key: string]: number } = {
+        K: 1024,
+        M: 1024 * 1024,
+        G: 1024 * 1024 * 1024,
+        T: 1024 * 1024 * 1024 * 1024,
+      };
+
+      const bytes = Math.round(value * multipliers[unit]);
+
+      // Cache and store result
+      processMemoryCache.set(pid, { value: bytes, timestamp: now });
+      result.set(pid, bytes);
+    }
+
+    // For any PIDs that weren't in the output, cache null
+    for (const pid of pidsToFetch) {
+      if (!result.has(pid)) {
+        processMemoryCache.set(pid, { value: null, timestamp: now });
+        result.set(pid, null);
+      }
+    }
+
+    return result;
+  } catch {
+    // On error, cache null for all requested PIDs
+    for (const pid of pidsToFetch) {
+      processMemoryCache.set(pid, { value: null, timestamp: now });
+      result.set(pid, null);
+    }
+    return result;
+  }
+}
+
+/**
+ * Get memory usage for a single process in bytes
+ * Uses 'top' on macOS which reports CPU memory only (NOT GPU/Metal memory)
  * Returns null if process not found or error occurs
+ * Caches results for 3 seconds to prevent spawning too many top processes
+ *
+ * Note: For llama-server processes with GPU offloading, use ServerConfig.metalMemoryMB
+ * to get GPU memory allocation (parsed from logs during server startup)
+ *
+ * Note: For multiple PIDs, use getBatchProcessMemory() instead - much more efficient
  */
 export async function getProcessMemory(pid: number): Promise<number | null> {
-  try {
-    // Use top with -l 1 (one sample) to get memory stats
-    // MEM column shows resident memory including GPU memory on macOS
-    const output = await execCommand(`top -l 1 -pid ${pid} -stats mem`);
+  const result = await getBatchProcessMemory([pid]);
+  return result.get(pid) ?? null;
+}
+
+// Process CPU cache to prevent spawning too many 'ps' processes
+// Cache per PID with 3-second TTL
+const processCpuCache = new Map<number, { value: number | null; timestamp: number }>();
+const PROCESS_CPU_CACHE_TTL = 3000; // 3 seconds
+
+/**
+ * Batch get CPU usage for multiple processes in one ps call
+ * Much more efficient than calling getProcessCpu() multiple times
+ * Returns Map<pid, percentage> for all requested PIDs
+ */
+export async function getBatchProcessCpu(pids: number[]): Promise<Map<number, number | null>> {
+  const result = new Map<number, number | null>();
+  const now = Date.now();
 
-    // Get the last non-empty line which contains the memory value
-    const lines = output.split('\n').filter((line) => line.trim().length > 0);
-    if (lines.length === 0) return null;
+  // Check cache and collect PIDs that need fetching
+  const pidsToFetch: number[] = [];
+  for (const pid of pids) {
+    const cached = processCpuCache.get(pid);
+    if (cached && (now - cached.timestamp) < PROCESS_CPU_CACHE_TTL) {
+      result.set(pid, cached.value);
+    } else {
+      pidsToFetch.push(pid);
+    }
+  }
 
-    const memStr = lines[lines.length - 1].trim();
+  // If all PIDs were cached, return early
+  if (pidsToFetch.length === 0) {
+    return result;
+  }
 
-    // Parse memory string (e.g., "10.5G", "512M", "1024K", "10G")
-    const match = memStr.match(/^([\d.]+)([KMGT])$/);
-    if (!match) return null;
+  try {
+    // Build ps command with all PIDs: ps -p X,Y,Z -o pid=,%cpu=
+    const pidList = pidsToFetch.join(',');
+    const output = await execCommand(`ps -p ${pidList} -o pid=,%cpu= 2>/dev/null`);
 
-    const value = parseFloat(match[1]);
-    const unit = match[2];
+    // Parse output: each line is "PID %CPU" (e.g., "1438 45.2")
+    const lines = output.split('\n');
+    for (const line of lines) {
+      const match = line.trim().match(/^(\d+)\s+([\d.]+)\s*$/);
+      if (!match) continue;
 
-    // Convert to bytes
-    const multipliers: { [key: string]: number } = {
-      K: 1024,
-      M: 1024 * 1024,
-      G: 1024 * 1024 * 1024,
-      T: 1024 * 1024 * 1024 * 1024,
-    };
+      const pid = parseInt(match[1], 10);
+      const cpuPercent = parseFloat(match[2]);
+
+      // Cache and store result
+      processCpuCache.set(pid, { value: cpuPercent, timestamp: now });
+      result.set(pid, cpuPercent);
+    }
 
-    return Math.round(value * multipliers[unit]);
+    // For any PIDs that weren't in the output, cache null (process not running)
+    for (const pid of pidsToFetch) {
+      if (!result.has(pid)) {
+        processCpuCache.set(pid, { value: null, timestamp: now });
+        result.set(pid, null);
+      }
+    }
+
+    return result;
   } catch {
-    return null;
+    // On error, cache null for all requested PIDs
+    for (const pid of pidsToFetch) {
+      processCpuCache.set(pid, { value: null, timestamp: now });
+      result.set(pid, null);
+    }
+    return result;
   }
 }
+
+/**
+ * Get CPU usage for a single process as percentage (0-100+)
+ * Uses 'ps -o %cpu' on macOS
+ * Returns null if process not found or error occurs
+ * Caches results for 3 seconds to prevent spawning too many ps processes
+ *
+ * Note: For multiple PIDs, use getBatchProcessCpu() instead - much more efficient
+ */
+export async function getProcessCpu(pid: number): Promise<number | null> {
+  const result = await getBatchProcessCpu([pid]);
+  return result.get(pid) ?? null;
+}
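Taken together, the batch helpers let a multi-server monitor refresh every tracked process with one `top` spawn and one `ps` spawn instead of two per server. A usage sketch (the PIDs are invented for illustration):

```ts
import { getBatchProcessCpu, getBatchProcessMemory } from './process-utils.js';

// One `top` call and one `ps` call cover all tracked servers.
async function refreshProcessStats(pids: number[]): Promise<void> {
  const [memByPid, cpuByPid] = await Promise.all([
    getBatchProcessMemory(pids),
    getBatchProcessCpu(pids),
  ]);
  for (const pid of pids) {
    console.log(pid, memByPid.get(pid), cpuByPid.get(pid)); // bytes | null, %CPU | null
  }
}

void refreshProcessStats([1438, 2071, 3305]); // hypothetical llama-server PIDs
```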