@appkit/llamacpp-cli 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/CHANGELOG.md +7 -0
  2. package/README.md +52 -14
  3. package/dist/cli.js +8 -2
  4. package/dist/cli.js.map +1 -1
  5. package/dist/commands/logs.d.ts +4 -0
  6. package/dist/commands/logs.d.ts.map +1 -1
  7. package/dist/commands/logs.js +157 -24
  8. package/dist/commands/logs.js.map +1 -1
  9. package/dist/commands/ps.d.ts.map +1 -1
  10. package/dist/commands/ps.js +11 -1
  11. package/dist/commands/ps.js.map +1 -1
  12. package/dist/commands/start.d.ts +2 -0
  13. package/dist/commands/start.d.ts.map +1 -1
  14. package/dist/commands/start.js +4 -0
  15. package/dist/commands/start.js.map +1 -1
  16. package/dist/lib/config-generator.d.ts +2 -0
  17. package/dist/lib/config-generator.d.ts.map +1 -1
  18. package/dist/lib/config-generator.js +6 -0
  19. package/dist/lib/config-generator.js.map +1 -1
  20. package/dist/lib/launchctl-manager.d.ts.map +1 -1
  21. package/dist/lib/launchctl-manager.js +22 -13
  22. package/dist/lib/launchctl-manager.js.map +1 -1
  23. package/dist/types/server-config.d.ts +2 -0
  24. package/dist/types/server-config.d.ts.map +1 -1
  25. package/dist/types/server-config.js.map +1 -1
  26. package/dist/utils/log-parser.d.ts +37 -0
  27. package/dist/utils/log-parser.d.ts.map +1 -0
  28. package/dist/utils/log-parser.js +164 -0
  29. package/dist/utils/log-parser.js.map +1 -0
  30. package/dist/utils/process-utils.d.ts +6 -0
  31. package/dist/utils/process-utils.d.ts.map +1 -1
  32. package/dist/utils/process-utils.js +35 -0
  33. package/dist/utils/process-utils.js.map +1 -1
  34. package/package.json +1 -1
  35. package/src/cli.ts +8 -2
  36. package/src/commands/logs.ts +133 -26
  37. package/src/commands/ps.ts +13 -2
  38. package/src/commands/start.ts +6 -0
  39. package/src/lib/config-generator.ts +8 -0
  40. package/src/lib/launchctl-manager.ts +22 -13
  41. package/src/types/server-config.ts +2 -0
  42. package/src/utils/log-parser.ts +184 -0
  43. package/src/utils/process-utils.ts +38 -0
@@ -15,6 +15,8 @@ interface StartOptions {
15
15
  threads?: number;
16
16
  ctxSize?: number;
17
17
  gpuLayers?: number;
18
+ logVerbosity?: number;
19
+ logTimestamps?: boolean;
18
20
  }
19
21
 
20
22
  export async function startCommand(model: string, options: StartOptions): Promise<void> {
@@ -67,6 +69,8 @@ export async function startCommand(model: string, options: StartOptions): Promis
67
69
  threads: options.threads,
68
70
  ctxSize: options.ctxSize,
69
71
  gpuLayers: options.gpuLayers,
72
+ logVerbosity: options.logVerbosity,
73
+ logTimestamps: options.logTimestamps,
70
74
  };
71
75
 
72
76
  const config = await configGenerator.generateConfig(
@@ -84,6 +88,8 @@ export async function startCommand(model: string, options: StartOptions): Promis
84
88
  console.log(chalk.dim(`Threads: ${config.threads}`));
85
89
  console.log(chalk.dim(`Context Size: ${config.ctxSize}`));
86
90
  console.log(chalk.dim(`GPU Layers: ${config.gpuLayers}`));
91
+ console.log(chalk.dim(`Log Verbosity: ${config.logVerbosity !== undefined ? config.logVerbosity : 'all'}`));
92
+ console.log(chalk.dim(`Log Timestamps: ${config.logTimestamps ? 'enabled' : 'disabled'}`));
87
93
  console.log();
88
94
 
89
95
  // 7. Ensure log directory exists
@@ -11,6 +11,8 @@ export interface ServerOptions {
11
11
  gpuLayers?: number;
12
12
  embeddings?: boolean;
13
13
  jinja?: boolean;
14
+ logVerbosity?: number;
15
+ logTimestamps?: boolean;
14
16
  }
15
17
 
16
18
  export interface SmartDefaults {
@@ -67,6 +69,8 @@ export class ConfigGenerator {
67
69
  const gpuLayers = options?.gpuLayers ?? smartDefaults.gpuLayers;
68
70
  const embeddings = options?.embeddings ?? true;
69
71
  const jinja = options?.jinja ?? true;
72
+ const logVerbosity = options?.logVerbosity; // Default to undefined (log everything), filter at CLI level
73
+ const logTimestamps = options?.logTimestamps ?? true; // Enable timestamps by default
70
74
 
71
75
  // Generate server ID
72
76
  const id = sanitizeModelName(modelName);
@@ -88,6 +92,8 @@ export class ConfigGenerator {
88
92
  gpuLayers,
89
93
  embeddings,
90
94
  jinja,
95
+ logVerbosity,
96
+ logTimestamps,
91
97
  status: 'stopped',
92
98
  createdAt: new Date().toISOString(),
93
99
  plistPath,
@@ -111,6 +117,8 @@ export class ConfigGenerator {
111
117
  gpuLayers: options?.gpuLayers ?? globalConfig.defaults.gpuLayers,
112
118
  embeddings: options?.embeddings ?? true,
113
119
  jinja: options?.jinja ?? true,
120
+ logVerbosity: options?.logVerbosity, // undefined = log everything
121
+ logTimestamps: options?.logTimestamps ?? true,
114
122
  };
115
123
  }
116
124
  }
@@ -16,6 +16,27 @@ export class LaunchctlManager {
16
16
  * Generate plist XML content for a server
17
17
  */
18
18
  generatePlist(config: ServerConfig): string {
19
+ // Build program arguments array
20
+ const args = [
21
+ '/opt/homebrew/bin/llama-server',
22
+ '--model', config.modelPath,
23
+ '--port', config.port.toString(),
24
+ '--threads', config.threads.toString(),
25
+ '--ctx-size', config.ctxSize.toString(),
26
+ '--gpu-layers', config.gpuLayers.toString(),
27
+ ];
28
+
29
+ // Add flags
30
+ if (config.embeddings) args.push('--embeddings');
31
+ if (config.jinja) args.push('--jinja');
32
+ if (config.logVerbosity !== undefined) {
33
+ args.push('--log-verbosity', config.logVerbosity.toString());
34
+ }
35
+ if (config.logTimestamps) args.push('--log-timestamps');
36
+
37
+ // Generate XML array elements
38
+ const argsXml = args.map(arg => ` <string>${arg}</string>`).join('\n');
39
+
19
40
  return `<?xml version="1.0" encoding="UTF-8"?>
20
41
  <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
21
42
  "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
@@ -26,19 +47,7 @@ export class LaunchctlManager {
26
47
 
27
48
  <key>ProgramArguments</key>
28
49
  <array>
29
- <string>/opt/homebrew/bin/llama-server</string>
30
- <string>--model</string>
31
- <string>${config.modelPath}</string>
32
- <string>--port</string>
33
- <string>${config.port}</string>
34
- <string>--threads</string>
35
- <string>${config.threads}</string>
36
- <string>--ctx-size</string>
37
- <string>${config.ctxSize}</string>
38
- <string>--gpu-layers</string>
39
- <string>${config.gpuLayers}</string>
40
- <string>--embeddings</string>
41
- <string>--jinja</string>
50
+ ${argsXml}
42
51
  </array>
43
52
 
44
53
  <key>RunAtLoad</key>
@@ -12,6 +12,8 @@ export interface ServerConfig {
12
12
  gpuLayers: number;
13
13
  embeddings: boolean; // Always true
14
14
  jinja: boolean; // Always true
15
+ logVerbosity?: number; // Log verbosity level (0=errors only, 1=warnings, 2=info/HTTP, 9=debug, undefined=all)
16
+ logTimestamps: boolean; // Add timestamps to log messages
15
17
 
16
18
  // State tracking
17
19
  status: ServerStatus;
@@ -0,0 +1,184 @@
1
/**
 * Parse and consolidate verbose llama-server logs into compact single-line format
 */

/** One consolidated request/response pair; fields are listed in output order. */
interface CompactLogEntry {
  timestamp: string;
  method: string;
  endpoint: string;
  ip: string;
  status: number;
  userMessage: string;
  tokensIn: number;
  tokensOut: number;
  responseTimeMs: number;
}

/** A parsed JSON object whose property values have not been validated yet. */
type JsonObject = Record<string, unknown>;

/** Marker that opens a buffered request (only POST requests are consolidated). */
const REQUEST_START_MARKER = 'log_server_r: request: POST';
/** Marker shared by the request summary line and the request body line. */
const REQUEST_MARKER = 'log_server_r: request:';
/** Marker of the response body line, which closes a buffered request. */
const RESPONSE_MARKER = 'log_server_r: response:';
/** Maximum length of the user-message preview, including the trailing "...". */
const MAX_MESSAGE_LENGTH = 50;

/** Type guard: true for non-null, non-array objects. */
function isJsonObject(value: unknown): value is JsonObject {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Returns `value` when it is a finite number, otherwise 0. */
function readNumber(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

export class LogParser {
  /** Lines collected since the last request-start marker. */
  private buffer: string[] = [];
  /** True while a request has been opened and its response line not yet seen. */
  private isBuffering = false;

  /**
   * Process log lines and output compact format.
   *
   * Feed log lines one at a time. A line containing the POST request marker
   * starts (or restarts) buffering; the first subsequent line containing the
   * response marker ends it, at which point the buffered lines are
   * consolidated and, if that succeeds, passed to `callback`. Lines seen
   * while not buffering are ignored.
   *
   * @param line - A single raw log line.
   * @param callback - Invoked with the compact line for each completed request.
   */
  processLine(line: string, callback: (compactLine: string) => void): void {
    // Check if this is the start of an HTTP request log
    if (line.includes(REQUEST_START_MARKER)) {
      this.isBuffering = true;
      this.buffer = [line];
      return;
    }

    if (!this.isBuffering) return;

    this.buffer.push(line);

    // The response line completes the request
    if (line.includes(RESPONSE_MARKER)) {
      const compactLine = this.consolidateRequest(this.buffer);
      if (compactLine) {
        callback(compactLine);
      }
      this.buffer = [];
      this.isBuffering = false;
    }
  }

  /**
   * Consolidate buffered request/response lines into single line.
   *
   * @returns The compact line, or null when the summary line, the request
   *   JSON or the response JSON cannot be found or parsed.
   */
  private consolidateRequest(lines: string[]): string | null {
    try {
      // First line carries the timestamp and "METHOD /endpoint ip status"
      const firstLine = lines[0];
      const timestamp = this.extractTimestamp(firstLine);
      const requestMatch = firstLine.match(/request: (POST|GET|PUT|DELETE) (\/[^\s]+) ([^\s]+) (\d+)/);
      if (!requestMatch) return null;

      const [, method, endpoint, ip, status] = requestMatch;

      // Request body: the request-marked line that contains JSON
      const requestLine = lines.find((l) => l.includes(REQUEST_MARKER) && l.includes('{'));
      if (!requestLine) return null;

      const requestJson = this.extractJson(requestLine);
      if (!requestJson) return null;

      const userMessage = this.extractUserMessage(requestJson);

      // Response body
      const responseLine = lines.find((l) => l.includes(RESPONSE_MARKER));
      if (!responseLine) return null;

      const responseJson = this.extractJson(responseLine);
      if (!responseJson) return null;

      const usage: JsonObject = isJsonObject(responseJson.usage) ? responseJson.usage : {};

      return this.formatCompactLine({
        timestamp,
        method,
        endpoint,
        ip,
        status: parseInt(status, 10),
        userMessage,
        tokensIn: readNumber(usage.prompt_tokens),
        tokensOut: readNumber(usage.completion_tokens),
        responseTimeMs: this.extractResponseTime(responseJson),
      });
    } catch {
      // Best effort: a malformed entry is skipped rather than breaking the log stream
      return null;
    }
  }

  /**
   * Extract timestamp from log line.
   *
   * @returns The bracketed `YYYY-MM-DD HH:MM:SS` value when present; otherwise
   *   the current time in the same layout (in UTC, since it is derived from
   *   `toISOString()`).
   */
  private extractTimestamp(line: string): string {
    // Look for timestamp format like [2025-12-09 10:13:45]
    const match = line.match(/\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]/);
    if (match) {
      return match[1]; // Return as-is: 2025-12-09 10:13:45
    }
    // If no timestamp in logs, use current time in same format
    return new Date().toISOString().substring(0, 19).replace('T', ' ');
  }

  /**
   * Extract JSON from log line.
   *
   * Parses everything from the first `{` to the end of the line.
   *
   * @returns The parsed object, or null when there is no `{` or parsing fails.
   */
  private extractJson(line: string): JsonObject | null {
    const jsonStart = line.indexOf('{');
    if (jsonStart === -1) return null;

    try {
      const parsed: unknown = JSON.parse(line.substring(jsonStart));
      return isJsonObject(parsed) ? parsed : null;
    } catch {
      return null;
    }
  }

  /**
   * Extract first user message from request JSON.
   *
   * Whitespace runs (including newlines) are collapsed to single spaces and
   * the result is truncated to 50 characters (47 plus "...").
   *
   * @returns The preview text, or '' when there is no user message or it has
   *   no text content.
   */
  private extractUserMessage(requestJson: JsonObject): string {
    const messages: unknown[] = Array.isArray(requestJson.messages) ? requestJson.messages : [];
    const userMsg = messages.find((m) => isJsonObject(m) && m.role === 'user');
    if (!isJsonObject(userMsg)) return '';

    const content = this.contentToText(userMsg.content).replace(/\s+/g, ' ').trim();
    return content.length > MAX_MESSAGE_LENGTH
      ? content.substring(0, MAX_MESSAGE_LENGTH - 3) + '...'
      : content;
  }

  /**
   * Flatten a chat message `content` value to plain text.
   *
   * `content` is either a string or an array of content parts. For arrays,
   * string parts and the `text` field of object parts are joined with spaces;
   * parts without text (e.g. images) are skipped. Calling string methods on
   * the array form directly would throw and drop the whole log entry.
   */
  private contentToText(content: unknown): string {
    if (typeof content === 'string') return content;
    if (!Array.isArray(content)) return '';

    const texts: string[] = [];
    for (const part of content as unknown[]) {
      if (typeof part === 'string') {
        texts.push(part);
      } else if (isJsonObject(part) && typeof part.text === 'string') {
        texts.push(part.text);
      }
    }
    return texts.join(' ');
  }

  /**
   * Extract response time from response JSON.
   *
   * Prefers `__verbose.timings`, then falls back to top-level `timings`.
   *
   * @returns `prompt_ms + predicted_ms` rounded to the nearest integer, or 0
   *   when neither timings object is present.
   */
  private extractResponseTime(responseJson: JsonObject): number {
    const verbose = responseJson.__verbose;
    const candidates: unknown[] = [
      isJsonObject(verbose) ? verbose.timings : undefined,
      responseJson.timings,
    ];

    for (const timings of candidates) {
      if (isJsonObject(timings)) {
        return Math.round(readNumber(timings.prompt_ms) + readNumber(timings.predicted_ms));
      }
    }
    return 0;
  }

  /**
   * Format compact log line.
   *
   * Fields are space-separated; the user message is wrapped in double quotes.
   */
  private formatCompactLine(entry: CompactLogEntry): string {
    return [
      entry.timestamp,
      entry.method,
      entry.endpoint,
      entry.ip,
      entry.status,
      `"${entry.userMessage}"`,
      entry.tokensIn,
      entry.tokensOut,
      entry.responseTimeMs,
    ].join(' ');
  }
}

// Export singleton instance
export const logParser = new LogParser();
@@ -58,3 +58,41 @@ export async function isPortInUse(port: number): Promise<boolean> {
58
58
  return false;
59
59
  }
60
60
  }
61
+
62
/**
 * Get memory usage for a process in bytes.
 *
 * Runs `top -l 1 -pid <pid> -stats mem` (one sample, MEM column only) and
 * parses the last non-empty output line. The original implementation notes
 * that this column includes GPU/Metal memory on macOS, which makes it a
 * better fit for llama-server than plain RSS.
 *
 * @param pid - Process ID to inspect; must be a non-negative integer.
 * @returns Memory usage in bytes, or null if the pid is invalid, the process
 *   is not found, the output cannot be parsed, or the command fails.
 */
export async function getProcessMemory(pid: number): Promise<number | null> {
  // pid is interpolated into a shell command, so reject NaN, fractions and
  // negatives at runtime even though the parameter is typed as number.
  if (!Number.isInteger(pid) || pid < 0) return null;

  try {
    const output = await execCommand(`top -l 1 -pid ${pid} -stats mem`);

    // Get the last non-empty line which contains the memory value
    const lines = output.split('\n').filter((line) => line.trim().length > 0);
    if (lines.length === 0) return null;

    const memStr = lines[lines.length - 1].trim();

    // Parse memory string (e.g., "10.5G", "512M", "1024K", "0B").
    // top may append "+" or "-" to mark a change since the previous sample;
    // the marker is accepted and ignored.
    const match = memStr.match(/^(\d+(?:\.\d+)?)([BKMGT])[+-]?$/);
    if (!match) return null;

    const value = parseFloat(match[1]);
    const unit = match[2];

    // Convert to bytes (binary multiples)
    const multipliers: Record<string, number> = {
      B: 1,
      K: 1024,
      M: 1024 ** 2,
      G: 1024 ** 3,
      T: 1024 ** 4,
    };

    const multiplier = multipliers[unit];
    if (multiplier === undefined) return null;

    return Math.round(value * multiplier);
  } catch {
    // Best effort: any failure running or parsing top is reported as "unknown"
    return null;
  }
}