@appkit/llamacpp-cli 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CHANGELOG.md +8 -0
  2. package/README.md +87 -1
  3. package/dist/cli.js +14 -0
  4. package/dist/cli.js.map +1 -1
  5. package/dist/commands/monitor.d.ts +2 -0
  6. package/dist/commands/monitor.d.ts.map +1 -0
  7. package/dist/commands/monitor.js +76 -0
  8. package/dist/commands/monitor.js.map +1 -0
  9. package/dist/lib/metrics-aggregator.d.ts +39 -0
  10. package/dist/lib/metrics-aggregator.d.ts.map +1 -0
  11. package/dist/lib/metrics-aggregator.js +200 -0
  12. package/dist/lib/metrics-aggregator.js.map +1 -0
  13. package/dist/lib/system-collector.d.ts +75 -0
  14. package/dist/lib/system-collector.d.ts.map +1 -0
  15. package/dist/lib/system-collector.js +310 -0
  16. package/dist/lib/system-collector.js.map +1 -0
  17. package/dist/tui/MonitorApp.d.ts +4 -0
  18. package/dist/tui/MonitorApp.d.ts.map +1 -0
  19. package/dist/tui/MonitorApp.js +293 -0
  20. package/dist/tui/MonitorApp.js.map +1 -0
  21. package/dist/tui/MultiServerMonitorApp.d.ts +4 -0
  22. package/dist/tui/MultiServerMonitorApp.d.ts.map +1 -0
  23. package/dist/tui/MultiServerMonitorApp.js +496 -0
  24. package/dist/tui/MultiServerMonitorApp.js.map +1 -0
  25. package/dist/tui/components/ErrorState.d.ts +8 -0
  26. package/dist/tui/components/ErrorState.d.ts.map +1 -0
  27. package/dist/tui/components/ErrorState.js +22 -0
  28. package/dist/tui/components/ErrorState.js.map +1 -0
  29. package/dist/tui/components/LoadingState.d.ts +8 -0
  30. package/dist/tui/components/LoadingState.d.ts.map +1 -0
  31. package/dist/tui/components/LoadingState.js +21 -0
  32. package/dist/tui/components/LoadingState.js.map +1 -0
  33. package/dist/types/monitor-types.d.ts +122 -0
  34. package/dist/types/monitor-types.d.ts.map +1 -0
  35. package/dist/types/monitor-types.js +3 -0
  36. package/dist/types/monitor-types.js.map +1 -0
  37. package/dist/utils/process-utils.d.ts +16 -1
  38. package/dist/utils/process-utils.d.ts.map +1 -1
  39. package/dist/utils/process-utils.js +144 -27
  40. package/dist/utils/process-utils.js.map +1 -1
  41. package/package.json +3 -1
  42. package/src/cli.ts +14 -0
  43. package/src/commands/monitor.ts +90 -0
  44. package/src/lib/metrics-aggregator.ts +244 -0
  45. package/src/lib/system-collector.ts +312 -0
  46. package/src/tui/MonitorApp.ts +361 -0
  47. package/src/tui/MultiServerMonitorApp.ts +547 -0
  48. package/src/types/monitor-types.ts +161 -0
  49. package/src/utils/process-utils.ts +160 -26
@@ -0,0 +1,244 @@
1
+ import { ServerConfig } from '../types/server-config.js';
2
+ import { ServerMetrics, SlotInfo, MonitorData } from '../types/monitor-types.js';
3
+ import { statusChecker } from './status-checker.js';
4
+ import { systemCollector } from './system-collector.js';
5
+ import { getProcessMemory } from '../utils/process-utils.js';
6
+
7
/**
 * Aggregates metrics from llama.cpp server API endpoints.
 *
 * Combines server health (`/health`), model properties (`/props`) and slot
 * status (`/slots`) into a single `ServerMetrics` snapshot. Generation speed
 * is not read from the server; it is derived by comparing each slot's decoded
 * token count between two consecutive polls of the same instance.
 */
export class MetricsAggregator {
  /** Base URL (`http://host:port`) that every endpoint path is appended to. */
  private serverUrl: string;
  /** Default per-request timeout in milliseconds. */
  private timeout: number;
  /** Last observed decoded-token count per slot id, used as the tok/s baseline. */
  private previousSlots: Map<number, { n_decoded: number; timestamp: number }> = new Map();

  /**
   * @param server - Server configuration; only `host` and `port` are read here.
   * @param timeout - Default request timeout in milliseconds.
   */
  constructor(server: ServerConfig, timeout: number = 5000) {
    // Handle null host (legacy configs) by defaulting to 127.0.0.1
    const host = server.host || '127.0.0.1';
    this.serverUrl = `http://${host}:${server.port}`;
    this.timeout = timeout;
  }

  /**
   * Fetch JSON from the llama.cpp API with a timeout.
   *
   * @param endpoint - Path appended to the server base URL (e.g. `/health`).
   * @param customTimeout - Overrides the instance timeout for this request.
   * @returns The parsed JSON body, or `null` on a non-2xx response, network
   *          error, timeout, or JSON parse error.
   */
  private async fetchWithTimeout(
    endpoint: string,
    customTimeout?: number
  ): Promise<any | null> {
    const controller = new AbortController();
    const timeoutMs = customTimeout ?? this.timeout;
    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

    try {
      const response = await fetch(`${this.serverUrl}${endpoint}`, {
        signal: controller.signal,
      });

      if (!response.ok) {
        return null;
      }

      // The timer is still armed here so a stalled body read is aborted too.
      return await response.json();
    } catch {
      // Network error, timeout, or parse error
      return null;
    } finally {
      // Always release the timer, including on failure paths, so it cannot
      // linger and keep the event loop alive until the timeout elapses.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Get server health status.
   *
   * @returns `true` only when `/health` answers with `{ status: 'ok' }`.
   */
  private async getHealth(): Promise<boolean> {
    const health = await this.fetchWithTimeout('/health');
    return health !== null && health.status === 'ok';
  }

  /**
   * Get server properties (model info, context size, etc.).
   *
   * @returns The `/props` payload, or `null` when it could not be fetched.
   */
  private async getProps(): Promise<any> {
    return await this.fetchWithTimeout('/props');
  }

  /**
   * Get slot information with calculated tokens per second.
   *
   * Speed is the decoded-token delta since the previous poll divided by the
   * elapsed wall-clock time, so the first poll of a generation has no timings.
   *
   * @returns One entry per slot reported by `/slots`; an empty array when the
   *          endpoint is unavailable or does not return an array.
   */
  private async getSlots(): Promise<SlotInfo[]> {
    const data = await this.fetchWithTimeout('/slots');
    if (!Array.isArray(data)) {
      return [];
    }

    const now = Date.now();

    return data.map((slot: any) => {
      const slotId = slot.id;
      // Accept `next_token` either as a one-element array or as a plain object.
      const nextToken = Array.isArray(slot.next_token) ? slot.next_token[0] : slot.next_token;
      const n_decoded = nextToken?.n_decoded || 0;
      const isProcessing = slot.is_processing;

      // Calculate tokens per second by comparing with previous poll
      let predicted_per_second: number | undefined;

      if (isProcessing && n_decoded > 0) {
        const previous = this.previousSlots.get(slotId);

        if (previous && previous.n_decoded < n_decoded) {
          const tokensGenerated = n_decoded - previous.n_decoded;
          const timeElapsed = (now - previous.timestamp) / 1000; // Convert to seconds

          if (timeElapsed > 0) {
            predicted_per_second = tokensGenerated / timeElapsed;
          }
        }

        // Store current state for next comparison
        this.previousSlots.set(slotId, { n_decoded, timestamp: now });
      } else {
        // Idle slot, or a busy slot that has not decoded anything yet: drop the
        // baseline so a new request is never measured against an old one.
        this.previousSlots.delete(slotId);
      }

      return {
        id: slotId,
        state: isProcessing ? 'processing' : 'idle',
        n_prompt_tokens: slot.n_prompt_tokens,
        n_decoded,
        n_ctx: slot.n_ctx || 0,
        // Only the generation rate is measured; the other fields are placeholders.
        timings:
          predicted_per_second !== undefined
            ? {
                prompt_n: 0,
                prompt_ms: 0,
                prompt_per_token_ms: 0,
                prompt_per_second: 0,
                predicted_n: n_decoded,
                predicted_ms: 0,
                predicted_per_token_ms: 0,
                predicted_per_second,
              }
            : undefined,
      };
    });
  }

  /**
   * Format the time elapsed since `lastStarted` as `"<h>h <m>m <s>s"`.
   *
   * @param lastStarted - Start time in any form accepted by `new Date(...)`.
   * @param now - Current time in epoch milliseconds.
   * @returns The formatted uptime, or `undefined` when `lastStarted` cannot be
   *          parsed. A start time in the future is clamped to zero uptime.
   */
  private formatUptime(lastStarted: string | number | Date, now: number): string | undefined {
    const startTime = new Date(lastStarted).getTime();
    if (!Number.isFinite(startTime)) {
      return undefined;
    }

    const uptimeSeconds = Math.max(0, Math.floor((now - startTime) / 1000));
    const hours = Math.floor(uptimeSeconds / 3600);
    const minutes = Math.floor((uptimeSeconds % 3600) / 60);
    const seconds = uptimeSeconds % 60;
    return `${hours}h ${minutes}m ${seconds}s`;
  }

  /**
   * Aggregate all server metrics.
   *
   * @param server - Server configuration
   * @param processMemory - Optional pre-fetched process memory (for batch
   *        collection). Pass `null` to skip the lookup; leave `undefined` to
   *        have it fetched from `server.pid`.
   * @returns A metrics snapshot; when the server is not running only the
   *          configured model name and context size are filled in.
   */
  async collectServerMetrics(
    server: ServerConfig,
    processMemory?: number | null
  ): Promise<ServerMetrics> {
    const now = Date.now();

    // Check basic server status first
    const status = await statusChecker.checkServer(server);

    // If server not running, return minimal data
    if (!status.isRunning) {
      return {
        server,
        healthy: false,
        modelLoaded: false,
        modelName: server.modelName,
        contextSize: server.ctxSize,
        totalSlots: 0,
        activeSlots: 0,
        idleSlots: 0,
        slots: [],
        timestamp: now,
        stale: false,
      };
    }

    // Uptime is only meaningful for a running server with a recorded start time
    const uptime = server.lastStarted ? this.formatUptime(server.lastStarted, now) : undefined;

    // Fetch detailed metrics in parallel
    // If processMemory was pre-fetched (batch mode), use it; otherwise fetch individually
    const [healthy, props, slots, fetchedMemory] = await Promise.all([
      this.getHealth(),
      this.getProps(),
      this.getSlots(),
      processMemory !== undefined
        ? processMemory
        : server.pid
          ? getProcessMemory(server.pid)
          : null,
    ]);

    // Calculate slot statistics
    const activeSlots = slots.filter((s) => s.state === 'processing').length;
    const idleSlots = slots.filter((s) => s.state === 'idle').length;
    const totalSlots = props?.total_slots || slots.length;

    // Calculate average speeds (only from processing slots that have timings)
    const processingSlots = slots.filter((s) => s.state === 'processing' && s.timings);
    const average = (pick: (s: SlotInfo) => number): number | undefined =>
      processingSlots.length > 0
        ? processingSlots.reduce((sum, s) => sum + pick(s), 0) / processingSlots.length
        : undefined;

    const avgPromptSpeed = average((s) => s.timings?.prompt_per_second || 0);
    const avgGenerateSpeed = average((s) => s.timings?.predicted_per_second || 0);

    return {
      server,
      healthy,
      uptime,
      modelLoaded: props !== null,
      modelName: server.modelName,
      contextSize: props?.default_generation_settings?.n_ctx || server.ctxSize,
      totalSlots,
      activeSlots,
      idleSlots,
      slots,
      avgPromptSpeed,
      avgGenerateSpeed,
      processMemory: fetchedMemory ?? undefined,
      timestamp: now,
      stale: false,
    };
  }

  /**
   * Collect complete monitoring data (server + system metrics).
   *
   * @param server - Server configuration
   * @param updateInterval - Polling interval in milliseconds, echoed back in
   *        the result.
   * @returns Server and system metrics collected in parallel, with
   *          `consecutiveFailures` reset to 0.
   */
  async collectMonitorData(
    server: ServerConfig,
    updateInterval: number = 2000
  ): Promise<MonitorData> {
    // Collect server and system metrics in parallel
    const [serverMetrics, systemMetrics] = await Promise.all([
      this.collectServerMetrics(server),
      systemCollector.collectSystemMetrics(),
    ]);

    return {
      server: serverMetrics,
      system: systemMetrics,
      lastUpdated: new Date(),
      updateInterval,
      consecutiveFailures: 0,
    };
  }
}
@@ -0,0 +1,312 @@
1
+ import { execCommand, spawnAndReadOneLine } from '../utils/process-utils.js';
2
+ import { SystemMetrics } from '../types/monitor-types.js';
3
+
4
/**
 * System metrics collector using macmon (optional) and vm_stat (fallback).
 *
 * Provides GPU, CPU, ANE and temperature readings from `macmon pipe` when the
 * binary is available, and memory figures from `vm_stat`. Results are cached
 * for `CACHE_TTL_MS` and concurrent callers share one in-flight collection.
 */
export class SystemCollector {
  /** Path (or command name) of the macmon binary. */
  private macmonPath: string;
  /** Cached availability probe result; `null` until the first check. */
  private macmonAvailable: boolean | null = null;
  /** Most recent metrics snapshot, served while it is still fresh. */
  private lastSystemMetrics: SystemMetrics | null = null;
  /** Epoch milliseconds at which the cached snapshot's collection started. */
  private lastCollectionTime: number = 0;
  private readonly CACHE_TTL_MS = 4000; // Cache for 4 seconds (longer than macmon spawn time)
  /** In-flight collection shared by concurrent callers. */
  private collectingLock: Promise<SystemMetrics> | null = null;
  private pCoreCount: number = 0;
  private eCoreCount: number = 0;
  private totalCores: number = 0;

  /**
   * @param macmonPath - Location of the macmon binary.
   */
  constructor(macmonPath: string = '/opt/homebrew/bin/macmon') {
    this.macmonPath = macmonPath;
    // Fire-and-forget: the method handles its own errors and never rejects.
    void this.initializeCoreCount();
  }

  /**
   * Get CPU core counts for weighted average calculation.
   *
   * Tries the per-performance-level sysctls first, then splits `hw.ncpu`
   * evenly, and finally assumes 4 + 4 cores when nothing can be executed.
   */
  private async initializeCoreCount(): Promise<void> {
    try {
      let pCores = 0;
      let eCores = 0;

      // Try to get P-core and E-core counts separately (Apple Silicon)
      try {
        const [pRaw, eRaw] = await Promise.all([
          execCommand('sysctl -n hw.perflevel0.physicalcpu 2>/dev/null'),
          execCommand('sysctl -n hw.perflevel1.physicalcpu 2>/dev/null'),
        ]);
        pCores = parseInt(pRaw, 10) || 0;
        eCores = parseInt(eRaw, 10) || 0;
      } catch {
        // perflevel sysctls not available; handled by the fallback below
      }

      // Fall back to the total core count when perflevel data is missing,
      // whether the command threw or simply produced unusable output.
      if (pCores + eCores === 0) {
        const total = parseInt(await execCommand('sysctl -n hw.ncpu 2>/dev/null'), 10) || 0;
        // Assume equal split if we can't get individual counts
        pCores = Math.floor(total / 2);
        eCores = total - pCores;
      }

      this.pCoreCount = pCores;
      this.eCoreCount = eCores;
      this.totalCores = pCores + eCores;
    } catch {
      // Default to 8 cores if we can't detect
      this.pCoreCount = 4;
      this.eCoreCount = 4;
      this.totalCores = 8;
    }
  }

  /**
   * Check if macmon is available.
   *
   * The result of the first probe is cached for the lifetime of the instance.
   */
  private async checkMacmonAvailability(): Promise<boolean> {
    if (this.macmonAvailable !== null) {
      return this.macmonAvailable;
    }

    try {
      // Single-quote the path so spaces or shell metacharacters cannot break
      // (or extend) the command line.
      const quotedPath = `'${this.macmonPath.replace(/'/g, `'\\''`)}'`;
      const result = await execCommand(`which ${quotedPath} 2>/dev/null`);
      this.macmonAvailable = result.length > 0;
    } catch {
      this.macmonAvailable = false;
    }

    return this.macmonAvailable;
  }

  /**
   * Parse one JSON line of `macmon pipe` output.
   *
   * Fields read (the first array element is ignored; macmon's README
   * documents it as the frequency in MHz):
   * {
   *   "gpu_usage": [_, ratio 0.0-1.0],
   *   "pcpu_usage": [_, ratio 0.0-1.0],
   *   "ecpu_usage": [_, ratio 0.0-1.0],
   *   "ane_power": number,
   *   "temp": {"cpu_temp_avg": number, "gpu_temp_avg": number}
   * }
   *
   * @param jsonLine - A single line emitted by `macmon pipe`.
   * @returns Percentages (0-100) and temperature; fields that are missing or
   *          not numeric are left `undefined`. Returns `{}` when the line is
   *          not a JSON object.
   */
  private parseMacmonJson(jsonLine: string): {
    gpuUsage?: number;
    cpuUsage?: number;
    aneUsage?: number;
    temperature?: number;
  } {
    try {
      const data = JSON.parse(jsonLine);
      if (typeof data !== 'object' || data === null) {
        return {};
      }

      // Second element of a usage pair, accepted only when it is a real number
      const usageRatio = (pair: unknown): number | undefined =>
        Array.isArray(pair) && typeof pair[1] === 'number' && Number.isFinite(pair[1])
          ? pair[1]
          : undefined;
      const numberOrUndefined = (value: unknown): number | undefined =>
        typeof value === 'number' && Number.isFinite(value) ? value : undefined;

      // GPU usage (convert ratio to percentage)
      const gpuRatio = usageRatio(data.gpu_usage);
      const gpuUsage = gpuRatio !== undefined ? gpuRatio * 100 : undefined;

      // CPU usage (weighted average of P-cores and E-cores)
      // Calculate weighted average: (P% * Pcount + E% * Ecount) / totalCores
      const pcpuUsage = usageRatio(data.pcpu_usage) ?? 0; // 0.0-1.0
      const ecpuUsage = usageRatio(data.ecpu_usage) ?? 0; // 0.0-1.0

      const cpuUsage =
        this.totalCores > 0
          ? // Weighted average normalized to 0-100%
            ((pcpuUsage * this.pCoreCount + ecpuUsage * this.eCoreCount) / this.totalCores) * 100
          : // Fallback: simple average if core counts not available
            ((pcpuUsage + ecpuUsage) / 2) * 100;

      // ANE usage (estimate from power draw - macmon doesn't provide usage %)
      // If ANE power > 0.1W, consider it active (rough estimate)
      const anePower = numberOrUndefined(data.ane_power) ?? 0;
      const aneUsage =
        anePower > 0.1
          ? Math.min((anePower / 8.0) * 100, 100) // Assume ~8W max for ANE
          : 0;

      // Temperature (use GPU temp if available, otherwise CPU)
      const temperature =
        numberOrUndefined(data.temp?.gpu_temp_avg) || numberOrUndefined(data.temp?.cpu_temp_avg);

      return {
        gpuUsage,
        cpuUsage: cpuUsage > 0 ? cpuUsage : undefined,
        aneUsage: aneUsage > 1 ? aneUsage : undefined,
        temperature,
      };
    } catch {
      return {};
    }
  }

  /**
   * Collect macmon metrics (GPU, CPU, ANE).
   *
   * Uses 'macmon pipe', which outputs one JSON line per update; the helper
   * spawns macmon, reads one line, and kills it to prevent process leaks.
   *
   * @returns Parsed metrics, or `null` when macmon is unavailable, produced no
   *          output, failed, or produced a line with no usable field.
   */
  private async getMacmonMetrics(): Promise<{
    gpuUsage?: number;
    cpuUsage?: number;
    aneUsage?: number;
    temperature?: number;
  } | null> {
    const available = await this.checkMacmonAvailability();
    if (!available) {
      return null;
    }

    try {
      // Timeout set to 5s because macmon can take 3-4s to produce first line
      const output = await spawnAndReadOneLine(this.macmonPath, ['pipe'], 5000);

      if (!output) {
        return null;
      }

      const parsed = this.parseMacmonJson(output);
      // An unparsable line yields no fields at all; report that as "no macmon
      // data" so the caller does not label an empty result as sourced from macmon.
      const hasData = Object.values(parsed).some((value) => value !== undefined);
      return hasData ? parsed : null;
    } catch {
      return null;
    }
  }

  /**
   * Parse vm_stat output for memory metrics.
   *
   * Expected format:
   *   Mach Virtual Memory Statistics: (page size of 16384 bytes)
   *   Pages free:                       123456.
   *   Pages active:                     234567.
   *   Pages inactive:                   345678.
   *   Pages speculative:                 45678.
   *   Pages wired down:                 123456.
   *   Pages occupied by compressor:      56789.
   *   ...
   *
   * @param output - Raw `vm_stat` text.
   * @returns Bytes used (active + wired + compressor-occupied) and the total
   *          bytes across all counted page categories.
   */
  private parseVmStatOutput(output: string): {
    memoryUsed: number;
    memoryTotal: number;
  } {
    // Prefer the page size vm_stat prints in its header; fall back to 16KB
    // (Apple Silicon) when the header is missing.
    const headerPageSize = parseInt(output.match(/page size of (\d+) bytes/)?.[1] ?? '', 10);
    const pageSize = headerPageSize > 0 ? headerPageSize : 16384;

    let pagesActive = 0;
    let pagesWired = 0;
    let pagesCompressed = 0;
    let pagesFree = 0;
    let pagesInactive = 0;
    let pagesSpeculative = 0;

    for (const line of output.split('\n')) {
      const match = line.match(/Pages (.*?):\s+(\d+)\./);
      if (!match) {
        continue;
      }

      const value = parseInt(match[2], 10);
      switch (match[1].toLowerCase()) {
        case 'active':
          pagesActive = value;
          break;
        case 'wired down':
          pagesWired = value;
          break;
        // vm_stat labels physical pages held by the compressor as
        // "Pages occupied by compressor"; 'compressed' is kept for compatibility.
        case 'occupied by compressor':
        case 'compressed':
          pagesCompressed = value;
          break;
        case 'free':
          pagesFree = value;
          break;
        case 'inactive':
          pagesInactive = value;
          break;
        case 'speculative':
          pagesSpeculative = value;
          break;
      }
    }

    // Calculate used memory (active + wired + compressed)
    const usedPages = pagesActive + pagesWired + pagesCompressed;

    // Calculate total memory (used + free + inactive + speculative)
    const totalPages = usedPages + pagesFree + pagesInactive + pagesSpeculative;

    return {
      memoryUsed: usedPages * pageSize,
      memoryTotal: totalPages * pageSize,
    };
  }

  /**
   * Collect vm_stat memory metrics.
   *
   * @returns Parsed memory figures, or zeros when `vm_stat` cannot be run.
   */
  private async getVmStatMetrics(): Promise<{
    memoryUsed: number;
    memoryTotal: number;
  }> {
    try {
      const output = await execCommand('vm_stat 2>/dev/null');
      return this.parseVmStatOutput(output);
    } catch {
      // Fallback to zeros if vm_stat fails
      return { memoryUsed: 0, memoryTotal: 0 };
    }
  }

  /**
   * Collect all system metrics.
   *
   * Results are cached for `CACHE_TTL_MS`, measured from the moment the
   * collection started, and callers arriving while a collection is running
   * share its promise, so at most one macmon process is spawned at a time.
   */
  async collectSystemMetrics(): Promise<SystemMetrics> {
    const now = Date.now();

    // Return cached data if still fresh
    if (this.lastSystemMetrics && now - this.lastCollectionTime < this.CACHE_TTL_MS) {
      return this.lastSystemMetrics;
    }

    // If already collecting, wait for that to finish
    if (this.collectingLock) {
      return this.collectingLock;
    }

    // Start fresh collection
    this.collectingLock = this.doCollectSystemMetrics();

    try {
      const metrics = await this.collectingLock;
      this.lastSystemMetrics = metrics;
      this.lastCollectionTime = now;
      return metrics;
    } finally {
      this.collectingLock = null;
    }
  }

  /**
   * Internal method to actually collect system metrics.
   * Called by collectSystemMetrics with caching/locking.
   *
   * @returns Metrics tagged with their source: `'macmon'` when macmon data was
   *          obtained, `'vm_stat'` when only memory is known, else `'none'`.
   */
  private async doCollectSystemMetrics(): Promise<SystemMetrics> {
    const warnings: string[] = [];
    const now = Date.now();

    // The two collectors are independent, so run them side by side.
    // Memory always comes from vm_stat (more reliable than macmon).
    const [macmonMetrics, memoryMetrics] = await Promise.all([
      this.getMacmonMetrics(),
      this.getVmStatMetrics(),
    ]);

    // Determine source and add warnings
    let source: 'macmon' | 'vm_stat' | 'none';
    if (macmonMetrics) {
      source = 'macmon';
    } else if (memoryMetrics.memoryTotal > 0) {
      source = 'vm_stat';
      warnings.push('macmon not available - showing memory metrics only');
    } else {
      source = 'none';
      warnings.push('Unable to collect system metrics');
    }

    return {
      gpuUsage: macmonMetrics?.gpuUsage,
      cpuUsage: macmonMetrics?.cpuUsage,
      aneUsage: macmonMetrics?.aneUsage,
      temperature: macmonMetrics?.temperature,
      memoryUsed: memoryMetrics.memoryUsed,
      memoryTotal: memoryMetrics.memoryTotal,
      timestamp: now,
      source,
      warnings: warnings.length > 0 ? warnings : undefined,
    };
  }
}

// Export singleton instance
export const systemCollector = new SystemCollector();