@auxot/worker-cli 0.1.0
- package/README.md +156 -0
- package/dist/capabilities.js +125 -0
- package/dist/debug.js +54 -0
- package/dist/gpu-detection.js +171 -0
- package/dist/gpu-id.js +48 -0
- package/dist/index.js +341 -0
- package/dist/llama-binary.js +287 -0
- package/dist/llama-process.js +203 -0
- package/dist/llama.js +207 -0
- package/dist/model-downloader.js +145 -0
- package/dist/model-resolver.js +80 -0
- package/dist/policy-validator.js +242 -0
- package/dist/types.js +4 -0
- package/dist/websocket.js +433 -0
- package/package.json +57 -0
package/dist/index.js
ADDED

@@ -0,0 +1,341 @@
#!/usr/bin/env node
/**
 * Auxot GPU Worker CLI
 *
 * Connects local GPU resources (via llama.cpp) to the Auxot platform.
 *
 * Usage:
 *   auxot-worker --gpu-key gpu.xxx.yyy [--llama-url http://localhost:8000] [--auxot-url https://auxot.com]
 */
import { getOrCreateGpuId } from './gpu-id.js';
import { discoverCapabilities } from './capabilities.js';
import { WebSocketConnection } from './websocket.js';
import { processJob } from './llama.js';
import { setDebugLevel } from './debug.js';
import { spawnLlamaCpp, waitForLlamaReady } from './llama-process.js';
import { ensureModelDownloaded } from './model-resolver.js';
import { ensureLlamaBinary } from './llama-binary.js';
// Parse command line arguments
const args = process.argv.slice(2);
const config = {
    gpuKey: process.env.AUXOT_GPU_KEY || '',
    llamaUrl: process.env.AUXOT_LLAMA_URL || 'http://localhost:9002',
    auxotUrl: process.env.AUXOT_URL || 'http://auxot.localhost:9000',
    debugLevel: 0,
};
for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--gpu-key' && i + 1 < args.length) {
        config.gpuKey = args[++i];
    }
    else if (arg === '--llama-url' && i + 1 < args.length) {
        config.llamaUrl = args[++i];
    }
    else if (arg === '--auxot-url' && i + 1 < args.length) {
        config.auxotUrl = args[++i];
    }
    else if (arg === '--debug') {
        // Check if next arg is a number (debug level)
        if (i + 1 < args.length && /^[12]$/.test(args[i + 1])) {
            config.debugLevel = parseInt(args[++i], 10);
        }
        else {
            config.debugLevel = 1; // Default to level 1
        }
    }
    else if (arg === '--help' || arg === '-h') {
        console.log(`
Auxot GPU Worker CLI

Usage:
  auxot-worker [options]

Options:
  --gpu-key <key>      GPU authentication key (format: gpu.xxx.yyy)
                       (or set AUXOT_GPU_KEY env var)
  --llama-url <url>    llama.cpp server URL (default: http://localhost:8000)
                       (or set AUXOT_LLAMA_URL env var)
  --auxot-url <url>    Auxot platform URL (default: https://auxot.com)
                       (or set AUXOT_URL env var)
  --debug [level]      Enable debug logging (level 1 or 2, default: 1)
                       Level 1: WebSocket messages (CLI <-> server)
                       Level 2: Level 1 + llama.cpp messages
  --help, -h           Show this help message

Environment Variables:
  AUXOT_GPU_KEY        GPU authentication key (overridden by --gpu-key)
  AUXOT_LLAMA_URL      llama.cpp server URL (overridden by --llama-url)
  AUXOT_URL            Auxot platform URL (overridden by --auxot-url)
`.trim());
        process.exit(0);
    }
}
// Validate required arguments
if (!config.gpuKey) {
    console.error('Error: GPU key is required');
    console.error('Set AUXOT_GPU_KEY environment variable or use --gpu-key flag');
    console.error('Run with --help for usage information');
    process.exit(1);
}
if (!config.gpuKey.startsWith('gpu.')) {
    console.error('Error: GPU key must start with "gpu."');
    process.exit(1);
}
async function main() {
    // Set debug level before any operations
    setDebugLevel(config.debugLevel);
    console.log('Auxot GPU Worker CLI');
    console.log('====================');
    if (config.debugLevel > 0) {
        console.log(`Debug Level: ${config.debugLevel}`);
    }
    console.log();
    try {
        // 1. Get GPU ID
        console.log('[1/4] Loading GPU ID...');
        const gpuId = await getOrCreateGpuId();
        console.log(`GPU ID: ${gpuId}`);
        console.log();
        // 2. Connect to WebSocket server (server will send policy)
        console.log('[2/4] Connecting to Auxot platform...');
        const wsUrl = config.auxotUrl.replace(/^http/, 'ws').replace(/^https/, 'wss');
        // Create connection with placeholder capabilities (will be updated after spawning llama.cpp)
        const placeholderCapabilities = {
            model: 'pending',
            ctx_size: 0,
            backend: 'cpu', // Placeholder - will be updated after discovery
        };
        const wsConnection = new WebSocketConnection(config.gpuKey, placeholderCapabilities);
        // Track llama.cpp process for cleanup
        let llamaProcess = null;
        // Track active jobs with abort controllers for cancellation
        const activeJobs = new Map();
        // Register job handler BEFORE connecting to avoid race condition
        wsConnection.onJob(async (job) => {
            // Create AbortController for this job
            const abortController = new AbortController();
            activeJobs.set(job.job_id, abortController);
            try {
                // Get current capabilities from WebSocket connection (updated after policy callback)
                const currentCapabilities = wsConnection.getCapabilities();
                const result = await processJob(job, 'http://127.0.0.1:9002', // Always use local llama.cpp (spawned by worker-cli)
                currentCapabilities, // Pass capabilities for max_tokens_default
                abortController.signal, // Pass abort signal
                (token) => {
                    // Stream token to server
                    wsConnection.sendToken(job.job_id, token);
                });
                // Check if job was cancelled
                const wasCancelled = abortController.signal.aborted;
                // Send completion with stats (including partial response if cancelled)
                wsConnection.sendComplete(job.job_id, result.fullResponse, result.durationMs, result.inputTokens, result.outputTokens, result.tool_calls);
                if (wasCancelled) {
                    console.log(`✓ Job ${job.job_id} cancelled - sent partial response`);
                    // Also send error to mark job as cancelled in database
                    wsConnection.sendError(job.job_id, 'Job cancelled by user');
                }
                else {
                    console.log(`✓ Job ${job.job_id} completed successfully`);
                }
            }
            catch (error) {
                console.error(`✗ Job ${job.job_id} failed:`, error);
                // Send error for actual errors (not cancellation)
                wsConnection.sendError(job.job_id, error instanceof Error ? error.message : 'Unknown error');
            }
            finally {
                // Clean up abort controller
                activeJobs.delete(job.job_id);
            }
        });
        // Register policy callback (called when policy is received from server)
        wsConnection.onPolicy(async (policy) => {
            console.log('[3/4] Setting up llama.cpp...');
            console.log(` Policy: ${policy.model_name} (${policy.quantization})`);
            console.log(` Context size: ${policy.context_size}`);
            console.log(` Max parallelism: ${policy.max_parallelism}`);
            try {
                // 1. Download model if needed
                console.log(' Downloading/checking model...');
                const modelPath = await ensureModelDownloaded(policy);
                if (!modelPath) {
                    throw new Error(`Model not found in registry: ${policy.model_name} (${policy.quantization})`);
                }
                console.log(` ✓ Model ready: ${modelPath}`);
                // 2. Ensure llama.cpp binary is available
                console.log(' Downloading/checking llama.cpp binary...');
                const binaryPath = await ensureLlamaBinary();
                console.log(` ✓ Binary ready: ${binaryPath}`);
                // 3. Determine GPU layers (use all layers for GPU acceleration)
                // For 14B models, we typically want all layers on GPU for best performance
                // Setting to a high number (9999) will use all available layers
                const gpuLayers = 9999; // Use all layers on GPU for maximum performance
                // 3. Spawn llama.cpp process
                console.log(' Spawning llama.cpp process...');
                llamaProcess = await spawnLlamaCpp({
                    binaryPath,
                    modelPath,
                    contextSize: policy.context_size,
                    parallelism: policy.max_parallelism,
                    port: 9002,
                    host: '127.0.0.1',
                    gpuLayers, // Enable GPU acceleration
                });
                // Register crash handler for auto-restart
                const setupCrashHandler = (proc) => {
                    proc.onCrash(async (code, signal) => {
                        console.error(`\n[llama.cpp] Process crashed (code: ${code}, signal: ${signal})`);
                        console.log('[llama.cpp] Attempting to restart...');
                        try {
                            // Wait a bit before restarting to avoid rapid restart loops
                            await new Promise(resolve => setTimeout(resolve, 2000));
                            // Restart the process
                            if (llamaProcess) {
                                const restarted = await llamaProcess.restart();
                                llamaProcess = restarted;
                                // Re-register crash handler on new process
                                setupCrashHandler(restarted);
                            }
                            // Wait for it to be ready
                            await waitForLlamaReady('http://127.0.0.1:9002');
                            console.log('[llama.cpp] ✓ Restarted successfully');
                            // Re-discover capabilities and update server
                            const capabilities = await discoverCapabilities('http://127.0.0.1:9002');
                            wsConnection.updateCapabilities(capabilities);
                            wsConnection.sendConfig(capabilities);
                            console.log('[llama.cpp] ✓ Capabilities updated after restart');
                        }
                        catch (restartError) {
                            console.error('[llama.cpp] ✗ Failed to restart:', restartError);
                            console.error('[llama.cpp] Worker will continue but may not process jobs correctly');
                        }
                    });
                };
                setupCrashHandler(llamaProcess);
                // 4. Wait for llama.cpp to be ready
                console.log(' Waiting for llama.cpp to be ready...');
                await waitForLlamaReady('http://127.0.0.1:9002');
                console.log(' ✓ llama.cpp is ready');
                // 5. Warm up the model (ensure it's loaded and ready)
                console.log(' Warming up model...');
                try {
                    // Send a minimal prompt to ensure model is loaded
                    const warmupResponse = await fetch('http://127.0.0.1:9002/v1/chat/completions', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({
                            model: 'placeholder', // Will use default model
                            messages: [{ role: 'user', content: 'Hi' }],
                            max_tokens: 1, // Just 1 token to warm up
                            stream: false,
                        }),
                    });
                    if (warmupResponse.ok) {
                        await warmupResponse.json(); // Consume response
                        console.log(' ✓ Model warmed up');
                    }
                }
                catch (error) {
                    console.warn(' ⚠ Model warm-up failed (non-fatal):', error);
                }
                // 6. Discover actual capabilities
                console.log(' Discovering capabilities...');
                const capabilities = await discoverCapabilities('http://127.0.0.1:9002');
                // 7. Update WebSocket connection with real capabilities
                wsConnection.updateCapabilities(capabilities);
                // 8. Send config message to server for validation (this advertises presence)
                wsConnection.sendConfig(capabilities);
                console.log(' ✓ Capabilities discovered and sent to server');
            }
            catch (error) {
                console.error(' ✗ Failed to setup llama.cpp:', error);
                // Clean up process if spawned
                if (llamaProcess) {
                    try {
                        llamaProcess.stop();
                    }
                    catch (cleanupError) {
                        // Ignore cleanup errors
                    }
                }
                throw error;
            }
        });
        // Register config ack callback (called when server validates our config)
        wsConnection.onConfigAck((success, error) => {
            if (!success) {
                console.error(' ✗ Server rejected configuration:', error);
                // Stop reconnection and exit on config validation failure
                wsConnection.close();
                if (llamaProcess) {
                    try {
                        llamaProcess.stop();
                    }
                    catch (cleanupError) {
                        // Ignore cleanup errors
                    }
                }
                process.exit(1);
            }
            console.log(' ✓ Configuration validated by server');
        });
        // Register cancel handler
        wsConnection.onCancel((cancelMessage) => {
            console.log(`\n=== Cancelling job ${cancelMessage.job_id} ===`);
            const abortController = activeJobs.get(cancelMessage.job_id);
            if (abortController) {
                abortController.abort();
                console.log(`Sent abort signal to job ${cancelMessage.job_id}`);
            }
            else {
                console.log(`Job ${cancelMessage.job_id} not found in active jobs (may have already completed)`);
            }
        });
        try {
            await wsConnection.connect(wsUrl);
            console.log();
        }
        catch (error) {
            // Connection failed - could be authentication or policy validation error
            const errorMsg = error instanceof Error ? error.message : 'Unknown error';
            if (errorMsg.includes('Policy validation failed') || errorMsg.includes('policy')) {
                console.error('\n✗ Policy validation failed. Please configure your llama.cpp server to match the GPU key policy.');
                console.error(' See error details above for specific mismatches.');
            }
            else if (errorMsg.includes('GPU key policy not configured')) {
                console.error('\n✗ GPU key policy is not configured.');
                console.error(' Please configure the policy in the web UI before connecting workers.');
            }
            else {
                console.error('\n✗ Connection failed:', errorMsg);
            }
            process.exit(1);
        }
        // 4. Start processing jobs
        console.log('[4/4] Ready to process jobs');
        console.log('Waiting for work assignments...');
        console.log('Press Ctrl+C to stop');
        console.log();
        // Handle graceful shutdown
        const shutdown = () => {
            console.log('\nShutting down...');
            if (llamaProcess) {
                try {
                    llamaProcess.stop();
                }
                catch (error) {
                    console.error('Error stopping llama.cpp:', error);
                }
            }
            wsConnection.close();
            process.exit(0);
        };
        process.on('SIGINT', shutdown);
        process.on('SIGTERM', shutdown);
    }
    catch (error) {
        console.error('Fatal error:', error);
        process.exit(1);
    }
}
main();
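The job-handling pattern above (one AbortController per job_id, tokens streamed through a callback, cancellation resolved by looking the controller up in activeJobs) can be exercised outside the worker. The following is a minimal illustrative sketch, not part of the published package: runJob, handleJob, and cancelJob are hypothetical stand-ins for processJob and the WebSocketConnection callbacks.

// Sketch of the per-job AbortController pattern used in index.js (hypothetical names).
const activeJobs = new Map();

async function runJob(job, signal, onToken) {
    const fakeTokens = ['Hello', ' ', 'world']; // stands in for llama.cpp output
    let output = '';
    for (const token of fakeTokens) {
        await new Promise(r => setTimeout(r, 100)); // simulate generation latency
        if (signal.aborted) break;                  // stop streaming once cancelled
        onToken(token);
        output += token;
    }
    return { fullResponse: output, cancelled: signal.aborted };
}

async function handleJob(job) {
    const abortController = new AbortController();
    activeJobs.set(job.job_id, abortController);
    try {
        const result = await runJob(job, abortController.signal, (t) => process.stdout.write(t));
        console.log(result.cancelled ? `\nJob ${job.job_id} cancelled` : `\nJob ${job.job_id} done`);
    }
    finally {
        activeJobs.delete(job.job_id); // always release the controller
    }
}

function cancelJob(jobId) {
    activeJobs.get(jobId)?.abort(); // no-op if the job already finished
}

handleJob({ job_id: 'demo-1' });
setTimeout(() => cancelJob('demo-1'), 150); // cancel mid-stream

Because the controller is removed in a finally block, a cancel message that arrives after completion simply finds no entry, which is the same "may have already completed" branch that index.js logs.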
package/dist/llama-binary.js
ADDED

@@ -0,0 +1,287 @@
/**
 * llama.cpp Binary Downloader
 *
 * Downloads and manages llama.cpp server binary from GitHub releases.
 *
 * Features:
 * - Downloads fixed version for compatibility
 * - Platform-specific binaries with GPU acceleration
 * - Extracts binaries from archives
 * - Caches binary in ~/.auxot/llama-server/{platform}-{arch}/
 * - Works offline after first download
 *
 * GPU Support:
 * - macOS: Metal GPU acceleration (built into binaries)
 * - Linux: Vulkan GPU acceleration
 * - Windows: CUDA GPU acceleration (NVIDIA GPUs)
 */
import { existsSync, chmodSync, statSync } from 'node:fs';
import { mkdir, unlink } from 'node:fs/promises';
import { join } from 'node:path';
import { homedir } from 'os';
import { platform, arch } from 'os';
import { createWriteStream } from 'node:fs';
import { exec } from 'child_process';
import { promisify } from 'util';
import { detectGpuBackend } from './gpu-detection.js';
const execAsync = promisify(exec);
// Pin llama.cpp version for compatibility
// Update this when we want to upgrade to a new version
const LLAMA_CPP_VERSION = 'b7716'; // Latest stable as of implementation (confirmed via GitHub API)
const LLAMA_CPP_REPO = 'ggml-org/llama.cpp'; // Repository is ggml-org, not ggerganov
/**
 * Get platform-specific archive name based on detected GPU hardware
 *
 * GPU variants:
 * - macOS: Metal GPU acceleration (built into standard binaries)
 * - Linux: Vulkan GPU acceleration (AMD/NVIDIA) or CPU fallback
 * - Windows: CUDA 12.4 GPU acceleration (NVIDIA) or CPU fallback
 */
async function getArchiveName() {
    const os = platform();
    const architecture = arch();
    const gpuDetection = await detectGpuBackend();
    // Log warning if CPU fallback is used
    if (gpuDetection.warning) {
        console.warn(` ⚠ ${gpuDetection.warning}`);
    }
    // Map Node.js platform/arch to llama.cpp archive names
    if (os === 'darwin') {
        // macOS: Metal GPU acceleration is built into the binaries
        if (architecture === 'arm64') {
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-macos-arm64.tar.gz`, warning: gpuDetection.warning };
        }
        else {
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-macos-x64.tar.gz`, warning: gpuDetection.warning };
        }
    }
    else if (os === 'linux') {
        if (architecture === 'arm64' || architecture === 'aarch64') {
            throw new Error(`Linux ARM64 binaries not available - please use x64 or build from source`);
        }
        else if (architecture === 's390x') {
            // s390x only has CPU variant
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-ubuntu-s390x.tar.gz`, warning: gpuDetection.warning };
        }
        else {
            // Linux: Use Vulkan for GPU acceleration, CPU as fallback
            if (gpuDetection.backend === 'cpu') {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64.tar.gz`, warning: gpuDetection.warning };
            }
            else {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-ubuntu-vulkan-x64.tar.gz`, warning: gpuDetection.warning };
            }
        }
    }
    else if (os === 'win32') {
        if (architecture === 'arm64') {
            // Windows ARM64 only has CPU variant
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-win-cpu-arm64.zip`, warning: gpuDetection.warning };
        }
        else {
            // Windows: Use CUDA for GPU acceleration (NVIDIA), CPU as fallback
            if (gpuDetection.backend === 'cpu') {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-win-cpu-x64.zip`, warning: gpuDetection.warning };
            }
            else {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-win-cuda-12.4-x64.zip`, warning: gpuDetection.warning };
            }
        }
    }
    throw new Error(`Unsupported platform: ${os} ${architecture}`);
}
/**
 * Get cache directory for llama.cpp binary
 */
function getCacheDir() {
    const os = platform();
    const architecture = arch();
    const cacheDir = process.env.AUXOT_LLAMA_CACHE_DIR || join(homedir(), '.auxot', 'llama-server');
    return join(cacheDir, `${os}-${architecture}`);
}
/**
 * Get cached binary path
 * Binary is extracted to: {cacheDir}/llama-{VERSION}/llama-server
 */
function getBinaryPath() {
    const cacheDir = getCacheDir();
    return join(cacheDir, `llama-${LLAMA_CPP_VERSION}`, 'llama-server');
}
/**
 * Download and extract llama.cpp binary from GitHub releases
 */
export async function downloadLlamaBinary(onProgress) {
    const { archiveName, warning } = await getArchiveName();
    const binaryPath = getBinaryPath();
    const cacheDir = getCacheDir();
    const archivePath = join(cacheDir, archiveName);
    // Log warning if CPU fallback is used
    if (warning) {
        console.warn(` ⚠ ${warning}`);
    }
    // Check if binary already exists
    if (existsSync(binaryPath)) {
        const stats = statSync(binaryPath);
        if (stats.size > 0) {
            console.log(` ✓ llama.cpp binary already cached (${formatBytes(stats.size)})`);
            return binaryPath;
        }
    }
    // Create cache directory if it doesn't exist
    if (!existsSync(cacheDir)) {
        await mkdir(cacheDir, { recursive: true });
    }
    // Build GitHub release download URL
    // Format: https://github.com/{repo}/releases/download/{tag}/{asset}
    const downloadUrl = `https://github.com/${LLAMA_CPP_REPO}/releases/download/${LLAMA_CPP_VERSION}/${archiveName}`;
    console.log(` Downloading llama.cpp binary...`);
    console.log(` Version: ${LLAMA_CPP_VERSION}`);
    console.log(` Platform: ${platform()}-${arch()}`);
    console.log(` Archive: ${archiveName}`);
    // Download archive
    const response = await fetch(downloadUrl);
    if (!response.ok) {
        if (response.status === 404) {
            throw new Error(`Archive not found for platform ${platform()}-${arch()}. Available binaries may not exist for this platform.`);
        }
        throw new Error(`Download failed: ${response.status} ${response.statusText}`);
    }
    const contentLength = response.headers.get('content-length');
    const totalSize = contentLength ? parseInt(contentLength, 10) : 0;
    // Stream response to file
    const reader = response.body?.getReader();
    if (!reader) {
        throw new Error('Response body is not readable');
    }
    const fileStream = createWriteStream(archivePath);
    let downloadedBytes = 0;
    try {
        // Download archive
        while (true) {
            const { done, value } = await reader.read();
            if (done) {
                break;
            }
            fileStream.write(value);
            downloadedBytes += value.length;
            // Report progress
            if (onProgress) {
                onProgress(downloadedBytes, totalSize);
            }
            else if (totalSize > 0) {
                // Simple progress log every 1MB
                if (downloadedBytes % (1024 * 1024) < value.length) {
                    const percent = ((downloadedBytes / totalSize) * 100).toFixed(1);
                    process.stdout.write(`\r Progress: ${percent}% (${formatBytes(downloadedBytes)} / ${formatBytes(totalSize)})`);
                }
            }
        }
        fileStream.end();
        // Wait for file stream to finish
        await new Promise((resolve, reject) => {
            fileStream.on('finish', resolve);
            fileStream.on('error', reject);
        });
        if (totalSize > 0 && downloadedBytes !== totalSize) {
            throw new Error(`Download incomplete: ${downloadedBytes} bytes downloaded, expected ${totalSize}`);
        }
        if (onProgress) {
            process.stdout.write('\r');
        }
        else {
            process.stdout.write('\r');
        }
        console.log(` ✓ Archive downloaded (${formatBytes(downloadedBytes)})`);
        console.log(` Extracting archive...`);
        // Extract archive
        const os = platform();
        const extractDir = cacheDir;
        try {
            if (archiveName.endsWith('.tar.gz')) {
                // Extract tar.gz (macOS/Linux)
                await execAsync(`tar -xzf "${archivePath}" -C "${extractDir}"`, {
                    maxBuffer: 10 * 1024 * 1024, // 10MB buffer
                });
            }
            else if (archiveName.endsWith('.zip')) {
                // Extract zip (Windows)
                // Use PowerShell Expand-Archive on Windows, or unzip on Unix-like systems (Git Bash, WSL)
                if (os === 'win32') {
                    // PowerShell Expand-Archive (native Windows)
                    await execAsync(`powershell -Command "Expand-Archive -Path '${archivePath}' -DestinationPath '${extractDir}' -Force"`, {
                        maxBuffer: 10 * 1024 * 1024, // 10MB buffer
                    });
                }
                else {
                    // Use unzip (available on Unix-like systems, Git Bash, WSL)
                    await execAsync(`unzip -q "${archivePath}" -d "${extractDir}"`, {
                        maxBuffer: 10 * 1024 * 1024, // 10MB buffer
                    });
                }
            }
            else {
                throw new Error(`Unsupported archive format: ${archiveName}`);
            }
            // Clean up archive
            await unlink(archivePath);
            // Verify binary exists
            if (!existsSync(binaryPath)) {
                throw new Error(`Binary not found after extraction: ${binaryPath}`);
            }
            // Make binary executable (Unix-like systems)
            if (os !== 'win32') {
                chmodSync(binaryPath, 0o755);
            }
            console.log(` ✓ Extraction complete`);
            console.log(` ✓ Binary cached at: ${binaryPath}`);
            return binaryPath;
        }
        catch (extractError) {
            // Clean up archive on error
            if (existsSync(archivePath)) {
                await unlink(archivePath).catch(() => { });
            }
            throw new Error(`Extraction failed: ${extractError instanceof Error ? extractError.message : String(extractError)}`);
        }
    }
    catch (error) {
        fileStream.destroy();
        // Clean up partial download
        if (existsSync(archivePath)) {
            await unlink(archivePath).catch(() => { });
        }
        throw error;
    }
}
/**
 * Get llama.cpp binary path, downloading if necessary
 */
export async function ensureLlamaBinary(onProgress) {
    const binaryPath = getBinaryPath();
    // Check if binary exists and is valid
    if (existsSync(binaryPath)) {
        const stats = statSync(binaryPath);
        if (stats.size > 0) {
            return binaryPath;
        }
    }
    // Download and extract binary
    return await downloadLlamaBinary(onProgress);
}
/**
 * Get binary path without downloading (for checking if binary exists)
 */
export function getLlamaBinaryPath() {
    return getBinaryPath();
}
/**
 * Format bytes to human-readable string
 */
function formatBytes(bytes) {
    if (bytes === 0)
        return '0 B';
    const k = 1024;
    const sizes = ['B', 'KB', 'MB', 'GB'];
    const i = Math.floor(Math.log(bytes) / Math.log(k));
    return `${(bytes / Math.pow(k, i)).toFixed(1)} ${sizes[i]}`;
}
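Both downloadLlamaBinary and ensureLlamaBinary accept an optional onProgress(downloadedBytes, totalSize) callback, with totalSize equal to 0 when the response carries no content-length header; when the callback is omitted, the module falls back to its own 1 MB progress logging. A caller that wants its own progress display might wire it up as in the sketch below, which assumes an ESM context and a relative import path into dist/ and is not part of the package itself.

// Usage sketch: download (or reuse) the pinned llama-server binary with a
// one-line progress readout. The import path is an assumption for illustration.
import { ensureLlamaBinary } from './dist/llama-binary.js';

const binaryPath = await ensureLlamaBinary((downloadedBytes, totalSize) => {
    if (totalSize > 0) {
        const percent = ((downloadedBytes / totalSize) * 100).toFixed(1);
        process.stdout.write(`\r  downloading llama-server: ${percent}%`);
    }
    else {
        // No content-length header: fall back to a raw byte count.
        process.stdout.write(`\r  downloading llama-server: ${downloadedBytes} bytes`);
    }
});

console.log(`\nllama-server ready at ${binaryPath}`);

On a second run the cached binary (by default under ~/.auxot/llama-server/{platform}-{arch}/) is returned immediately and the callback never fires.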