@auxot/worker-cli 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +5897 -297
- package/dist/index.js.map +7 -0
- package/package.json +5 -5
- package/dist/capabilities.js +0 -125
- package/dist/debug.js +0 -54
- package/dist/gpu-detection.js +0 -171
- package/dist/gpu-id.js +0 -48
- package/dist/llama-binary.js +0 -287
- package/dist/llama-process.js +0 -203
- package/dist/llama.js +0 -207
- package/dist/model-downloader.js +0 -145
- package/dist/model-resolver.js +0 -80
- package/dist/policy-validator.js +0 -242
- package/dist/types.js +0 -4
- package/dist/websocket.js +0 -433
package/dist/llama-binary.js
DELETED
@@ -1,287 +0,0 @@
/**
 * llama.cpp Binary Downloader
 *
 * Downloads and manages llama.cpp server binary from GitHub releases.
 *
 * Features:
 * - Downloads fixed version for compatibility
 * - Platform-specific binaries with GPU acceleration
 * - Extracts binaries from archives
 * - Caches binary in ~/.auxot/llama-server/{platform}-{arch}/
 * - Works offline after first download
 *
 * GPU Support:
 * - macOS: Metal GPU acceleration (built into binaries)
 * - Linux: Vulkan GPU acceleration
 * - Windows: CUDA GPU acceleration (NVIDIA GPUs)
 */
import { existsSync, chmodSync, statSync } from 'node:fs';
import { mkdir, unlink } from 'node:fs/promises';
import { join } from 'node:path';
import { homedir } from 'os';
import { platform, arch } from 'os';
import { createWriteStream } from 'node:fs';
import { exec } from 'child_process';
import { promisify } from 'util';
import { detectGpuBackend } from './gpu-detection.js';
const execAsync = promisify(exec);
// Pin llama.cpp version for compatibility
// Update this when we want to upgrade to a new version
const LLAMA_CPP_VERSION = 'b7716'; // Latest stable as of implementation (confirmed via GitHub API)
const LLAMA_CPP_REPO = 'ggml-org/llama.cpp'; // Repository is ggml-org, not ggerganov
/**
 * Get platform-specific archive name based on detected GPU hardware
 *
 * GPU variants:
 * - macOS: Metal GPU acceleration (built into standard binaries)
 * - Linux: Vulkan GPU acceleration (AMD/NVIDIA) or CPU fallback
 * - Windows: CUDA 12.4 GPU acceleration (NVIDIA) or CPU fallback
 */
async function getArchiveName() {
    const os = platform();
    const architecture = arch();
    const gpuDetection = await detectGpuBackend();
    // Log warning if CPU fallback is used
    if (gpuDetection.warning) {
        console.warn(` ⚠ ${gpuDetection.warning}`);
    }
    // Map Node.js platform/arch to llama.cpp archive names
    if (os === 'darwin') {
        // macOS: Metal GPU acceleration is built into the binaries
        if (architecture === 'arm64') {
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-macos-arm64.tar.gz`, warning: gpuDetection.warning };
        }
        else {
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-macos-x64.tar.gz`, warning: gpuDetection.warning };
        }
    }
    else if (os === 'linux') {
        if (architecture === 'arm64' || architecture === 'aarch64') {
            throw new Error(`Linux ARM64 binaries not available - please use x64 or build from source`);
        }
        else if (architecture === 's390x') {
            // s390x only has CPU variant
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-ubuntu-s390x.tar.gz`, warning: gpuDetection.warning };
        }
        else {
            // Linux: Use Vulkan for GPU acceleration, CPU as fallback
            if (gpuDetection.backend === 'cpu') {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-ubuntu-x64.tar.gz`, warning: gpuDetection.warning };
            }
            else {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-ubuntu-vulkan-x64.tar.gz`, warning: gpuDetection.warning };
            }
        }
    }
    else if (os === 'win32') {
        if (architecture === 'arm64') {
            // Windows ARM64 only has CPU variant
            return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-win-cpu-arm64.zip`, warning: gpuDetection.warning };
        }
        else {
            // Windows: Use CUDA for GPU acceleration (NVIDIA), CPU as fallback
            if (gpuDetection.backend === 'cpu') {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-win-cpu-x64.zip`, warning: gpuDetection.warning };
            }
            else {
                return { archiveName: `llama-${LLAMA_CPP_VERSION}-bin-win-cuda-12.4-x64.zip`, warning: gpuDetection.warning };
            }
        }
    }
    throw new Error(`Unsupported platform: ${os} ${architecture}`);
}
/**
 * Get cache directory for llama.cpp binary
 */
function getCacheDir() {
    const os = platform();
    const architecture = arch();
    const cacheDir = process.env.AUXOT_LLAMA_CACHE_DIR || join(homedir(), '.auxot', 'llama-server');
    return join(cacheDir, `${os}-${architecture}`);
}
/**
 * Get cached binary path
 * Binary is extracted to: {cacheDir}/llama-{VERSION}/llama-server
 */
function getBinaryPath() {
    const cacheDir = getCacheDir();
    return join(cacheDir, `llama-${LLAMA_CPP_VERSION}`, 'llama-server');
}
/**
 * Download and extract llama.cpp binary from GitHub releases
 */
export async function downloadLlamaBinary(onProgress) {
    const { archiveName, warning } = await getArchiveName();
    const binaryPath = getBinaryPath();
    const cacheDir = getCacheDir();
    const archivePath = join(cacheDir, archiveName);
    // Log warning if CPU fallback is used
    if (warning) {
        console.warn(` ⚠ ${warning}`);
    }
    // Check if binary already exists
    if (existsSync(binaryPath)) {
        const stats = statSync(binaryPath);
        if (stats.size > 0) {
            console.log(` ✓ llama.cpp binary already cached (${formatBytes(stats.size)})`);
            return binaryPath;
        }
    }
    // Create cache directory if it doesn't exist
    if (!existsSync(cacheDir)) {
        await mkdir(cacheDir, { recursive: true });
    }
    // Build GitHub release download URL
    // Format: https://github.com/{repo}/releases/download/{tag}/{asset}
    const downloadUrl = `https://github.com/${LLAMA_CPP_REPO}/releases/download/${LLAMA_CPP_VERSION}/${archiveName}`;
    console.log(` Downloading llama.cpp binary...`);
    console.log(` Version: ${LLAMA_CPP_VERSION}`);
    console.log(` Platform: ${platform()}-${arch()}`);
    console.log(` Archive: ${archiveName}`);
    // Download archive
    const response = await fetch(downloadUrl);
    if (!response.ok) {
        if (response.status === 404) {
            throw new Error(`Archive not found for platform ${platform()}-${arch()}. Available binaries may not exist for this platform.`);
        }
        throw new Error(`Download failed: ${response.status} ${response.statusText}`);
    }
    const contentLength = response.headers.get('content-length');
    const totalSize = contentLength ? parseInt(contentLength, 10) : 0;
    // Stream response to file
    const reader = response.body?.getReader();
    if (!reader) {
        throw new Error('Response body is not readable');
    }
    const fileStream = createWriteStream(archivePath);
    let downloadedBytes = 0;
    try {
        // Download archive
        while (true) {
            const { done, value } = await reader.read();
            if (done) {
                break;
            }
            fileStream.write(value);
            downloadedBytes += value.length;
            // Report progress
            if (onProgress) {
                onProgress(downloadedBytes, totalSize);
            }
            else if (totalSize > 0) {
                // Simple progress log every 1MB
                if (downloadedBytes % (1024 * 1024) < value.length) {
                    const percent = ((downloadedBytes / totalSize) * 100).toFixed(1);
                    process.stdout.write(`\r Progress: ${percent}% (${formatBytes(downloadedBytes)} / ${formatBytes(totalSize)})`);
                }
            }
        }
        fileStream.end();
        // Wait for file stream to finish
        await new Promise((resolve, reject) => {
            fileStream.on('finish', resolve);
            fileStream.on('error', reject);
        });
        if (totalSize > 0 && downloadedBytes !== totalSize) {
            throw new Error(`Download incomplete: ${downloadedBytes} bytes downloaded, expected ${totalSize}`);
        }
        if (onProgress) {
            process.stdout.write('\r');
        }
        else {
            process.stdout.write('\r');
        }
        console.log(` ✓ Archive downloaded (${formatBytes(downloadedBytes)})`);
        console.log(` Extracting archive...`);
        // Extract archive
        const os = platform();
        const extractDir = cacheDir;
        try {
            if (archiveName.endsWith('.tar.gz')) {
                // Extract tar.gz (macOS/Linux)
                await execAsync(`tar -xzf "${archivePath}" -C "${extractDir}"`, {
                    maxBuffer: 10 * 1024 * 1024, // 10MB buffer
                });
            }
            else if (archiveName.endsWith('.zip')) {
                // Extract zip (Windows)
                // Use PowerShell Expand-Archive on Windows, or unzip on Unix-like systems (Git Bash, WSL)
                if (os === 'win32') {
                    // PowerShell Expand-Archive (native Windows)
                    await execAsync(`powershell -Command "Expand-Archive -Path '${archivePath}' -DestinationPath '${extractDir}' -Force"`, {
                        maxBuffer: 10 * 1024 * 1024, // 10MB buffer
                    });
                }
                else {
                    // Use unzip (available on Unix-like systems, Git Bash, WSL)
                    await execAsync(`unzip -q "${archivePath}" -d "${extractDir}"`, {
                        maxBuffer: 10 * 1024 * 1024, // 10MB buffer
                    });
                }
            }
            else {
                throw new Error(`Unsupported archive format: ${archiveName}`);
            }
            // Clean up archive
            await unlink(archivePath);
            // Verify binary exists
            if (!existsSync(binaryPath)) {
                throw new Error(`Binary not found after extraction: ${binaryPath}`);
            }
            // Make binary executable (Unix-like systems)
            if (os !== 'win32') {
                chmodSync(binaryPath, 0o755);
            }
            console.log(` ✓ Extraction complete`);
            console.log(` ✓ Binary cached at: ${binaryPath}`);
            return binaryPath;
        }
        catch (extractError) {
            // Clean up archive on error
            if (existsSync(archivePath)) {
                await unlink(archivePath).catch(() => { });
            }
            throw new Error(`Extraction failed: ${extractError instanceof Error ? extractError.message : String(extractError)}`);
        }
    }
    catch (error) {
        fileStream.destroy();
        // Clean up partial download
        if (existsSync(archivePath)) {
            await unlink(archivePath).catch(() => { });
        }
        throw error;
    }
}
/**
 * Get llama.cpp binary path, downloading if necessary
 */
export async function ensureLlamaBinary(onProgress) {
    const binaryPath = getBinaryPath();
    // Check if binary exists and is valid
    if (existsSync(binaryPath)) {
        const stats = statSync(binaryPath);
        if (stats.size > 0) {
            return binaryPath;
        }
    }
    // Download and extract binary
    return await downloadLlamaBinary(onProgress);
}
/**
 * Get binary path without downloading (for checking if binary exists)
 */
export function getLlamaBinaryPath() {
    return getBinaryPath();
}
/**
 * Format bytes to human-readable string
 */
function formatBytes(bytes) {
    if (bytes === 0)
        return '0 B';
    const k = 1024;
    const sizes = ['B', 'KB', 'MB', 'GB'];
    const i = Math.floor(Math.log(bytes) / Math.log(k));
    return `${(bytes / Math.pow(k, i)).toFixed(1)} ${sizes[i]}`;
}
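For orientation, a minimal consumer sketch of this module's exports (not part of the package; the relative import path and the logging inside the progress callback are assumed):

import { ensureLlamaBinary, getLlamaBinaryPath } from './llama-binary.js';

// Returns the cached path immediately if a non-empty binary exists,
// otherwise downloads and extracts the pinned llama.cpp release first.
const binaryPath = await ensureLlamaBinary((downloadedBytes, totalSize) => {
    // totalSize is 0 when the server omits content-length
    console.log(`downloaded ${downloadedBytes} of ${totalSize} bytes`);
});

// Path-only lookup, without triggering a download.
console.log(getLlamaBinaryPath());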
package/dist/llama-process.js
DELETED
@@ -1,203 +0,0 @@
/**
 * llama.cpp Process Management
 *
 * Spawns and manages llama.cpp server process with parameters from GPU key policy.
 *
 * Handles:
 * - Process lifecycle (start, stop, restart)
 * - Crash detection and restart
 * - stdout/stderr capture for debugging
 * - Graceful shutdown
 */
import { spawn } from 'child_process';
import { ensureLlamaBinary } from './llama-binary.js';
/**
 * Spawn llama.cpp server process with policy parameters
 */
export async function spawnLlamaCpp(options) {
    // Ensure llama.cpp binary is downloaded
    const binaryPath = options.binaryPath || await ensureLlamaBinary();
    const { modelPath, contextSize, parallelism, port = 9002, host = '127.0.0.1', gpuLayers, } = options;
    // Store original options for restart
    const originalOptions = {
        binaryPath,
        modelPath,
        contextSize,
        parallelism,
        port,
        host,
        gpuLayers,
    };
    // Build command arguments
    const args = [
        '--model', modelPath,
        '--ctx-size', contextSize.toString(),
        '--parallel', parallelism.toString(),
        '--port', port.toString(),
        '--host', host,
        '--batch-size', '512',
        '--threads', '12',
        '--jinja', // Enable Jinja templating for tool calling
    ];
    // Add GPU layers if specified
    if (gpuLayers !== undefined) {
        args.push('--n-gpu-layers', gpuLayers.toString());
    }
    console.log(`[llama.cpp] Spawning process: ${binaryPath} ${args.join(' ')}`);
    // Spawn process
    const childProcess = spawn(binaryPath, args, {
        stdio: ['ignore', 'pipe', 'pipe'], // stdin: ignore, stdout/stderr: pipe
        env: process.env,
    });
    let isRunning = true;
    const crashCallbacks = [];
    // Capture stdout for debugging (with buffering for better formatting)
    let stdoutBuffer = '';
    childProcess.stdout?.on('data', (data) => {
        stdoutBuffer += data.toString();
        // Process complete lines
        const lines = stdoutBuffer.split('\n');
        stdoutBuffer = lines.pop() || ''; // Keep incomplete line in buffer
        for (const line of lines) {
            const trimmed = line.trim();
            if (trimmed) {
                console.log(`[llama.cpp stdout] ${trimmed}`);
            }
        }
    });
    // Capture stderr for debugging (with buffering for better formatting)
    let stderrBuffer = '';
    childProcess.stderr?.on('data', (data) => {
        stderrBuffer += data.toString();
        // Process complete lines
        const lines = stderrBuffer.split('\n');
        stderrBuffer = lines.pop() || ''; // Keep incomplete line in buffer
        for (const line of lines) {
            const trimmed = line.trim();
            if (trimmed) {
                console.error(`[llama.cpp stderr] ${trimmed}`);
            }
        }
    });
    // Handle process exit
    childProcess.on('exit', (code, signal) => {
        isRunning = false;
        // Flush any remaining buffered output
        if (stdoutBuffer.trim()) {
            console.log(`[llama.cpp stdout] ${stdoutBuffer.trim()}`);
            stdoutBuffer = '';
        }
        if (stderrBuffer.trim()) {
            console.error(`[llama.cpp stderr] ${stderrBuffer.trim()}`);
            stderrBuffer = '';
        }
        if (code !== null) {
            console.log(`[llama.cpp] Process exited with code ${code}`);
            // Notify crash callbacks (non-zero exit is a crash)
            if (code !== 0) {
                for (const callback of crashCallbacks) {
                    callback(code, signal);
                }
            }
        }
        else if (signal) {
            console.log(`[llama.cpp] Process killed with signal ${signal}`);
            // Notify crash callbacks (killed by signal is a crash unless it's SIGTERM/SIGINT)
            if (signal !== 'SIGTERM' && signal !== 'SIGINT') {
                for (const callback of crashCallbacks) {
                    callback(null, signal);
                }
            }
        }
    });
    // Handle process errors
    childProcess.on('error', (error) => {
        console.error(`[llama.cpp] Process error:`, error);
        isRunning = false;
        // Notify crash callbacks
        for (const callback of crashCallbacks) {
            callback(null, null);
        }
    });
    return {
        process: childProcess,
        get isRunning() {
            return isRunning && childProcess.exitCode === null;
        },
        async stop() {
            if (!isRunning) {
                return;
            }
            console.log('[llama.cpp] Stopping process...');
            isRunning = false;
            // Try graceful shutdown first (SIGTERM)
            childProcess.kill('SIGTERM');
            // Wait up to 5 seconds for graceful shutdown
            await new Promise((resolve) => {
                const timeout = setTimeout(() => {
                    // Force kill if still running
                    if (childProcess.exitCode === null) {
                        console.warn('[llama.cpp] Process did not exit gracefully, force killing...');
                        childProcess.kill('SIGKILL');
                    }
                    resolve();
                }, 5000);
                childProcess.once('exit', () => {
                    clearTimeout(timeout);
                    resolve();
                });
            });
        },
        async restart() {
            console.log('[llama.cpp] Restarting process...');
            // Stop current process
            const stopPromise = (async () => {
                if (childProcess && isRunning) {
                    childProcess.kill('SIGTERM');
                    // Wait for graceful shutdown
                    await new Promise((resolve) => {
                        const timeout = setTimeout(() => {
                            if (childProcess && isRunning) {
                                console.warn('[llama.cpp] Process did not exit gracefully, force killing...');
                                childProcess.kill('SIGKILL');
                            }
                            resolve();
                        }, 5000);
                        childProcess.once('exit', () => {
                            clearTimeout(timeout);
                            resolve();
                        });
                    });
                }
            })();
            await stopPromise;
            isRunning = false;
            // Create a new process with the same options
            return await spawnLlamaCpp(originalOptions);
        },
        onCrash(callback) {
            crashCallbacks.push(callback);
        },
    };
}
/**
 * Wait for llama.cpp server to be ready (health check)
 */
export async function waitForLlamaReady(url, timeoutMs = 30000) {
    const startTime = Date.now();
    const checkInterval = 500; // Check every 500ms
    while (Date.now() - startTime < timeoutMs) {
        try {
            const response = await fetch(`${url}/v1/models`);
            if (response.ok) {
                console.log('[llama.cpp] Server is ready');
                return;
            }
        }
        catch (error) {
            // Server not ready yet, continue waiting
        }
        await new Promise((resolve) => setTimeout(resolve, checkInterval));
    }
    throw new Error(`llama.cpp server did not become ready within ${timeoutMs}ms`);
}
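A minimal sketch of how the process-management exports above fit together (not part of the package; the model path and tuning values are placeholders):

import { spawnLlamaCpp, waitForLlamaReady } from './llama-process.js';

const llama = await spawnLlamaCpp({
    modelPath: '/models/example.gguf', // placeholder path
    contextSize: 8192,
    parallelism: 2,
    port: 9002,
    host: '127.0.0.1',
    gpuLayers: 99,
});
llama.onCrash((code, signal) => {
    console.error(`llama.cpp crashed (code=${code}, signal=${signal})`);
});
await waitForLlamaReady('http://127.0.0.1:9002'); // polls /v1/models for up to 30s
// ... serve jobs ...
await llama.stop(); // SIGTERM, then SIGKILL after 5 seconds if needed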
package/dist/llama.js
DELETED
@@ -1,207 +0,0 @@
/**
 * llama.cpp Integration
 *
 * Simple dumb router: forwards OpenAI-format jobs to llama.cpp and streams back tokens.
 * NO prompt templates or business logic - server controls all prompts.
 */
import { createParser } from 'eventsource-parser';
import { logClientToLlama, logLlamaToClient } from './debug.js';
/**
 * Process job by forwarding to llama.cpp
 *
 * Dumb router - receives OpenAI format, forwards to llama.cpp, streams back tokens.
 *
 * If job.max_tokens is not provided, uses capabilities.max_tokens_default to override
 * llama.cpp's OpenAI API layer default (which is 2048, too restrictive).
 *
 * Supports cancellation via AbortSignal.
 */
export async function processJob(job, llamaUrl, capabilities, abortSignal, onToken) {
    const startTime = Date.now();
    // Prepare request to llama.cpp (OpenAI compatible format)
    const request = {
        model: 'default',
        messages: job.messages,
        temperature: job.temperature ?? 0.7,
        stream: true,
    };
    // Include tools if provided (for function calling)
    if (job.tools && job.tools.length > 0) {
        request.tools = job.tools;
    }
    // Handle max_tokens:
    // - If job specifies max_tokens, use it (explicit limit)
    // - Otherwise, use capabilities.max_tokens_default from /props (server's default, often -1 for unlimited)
    // - This overrides llama.cpp's OpenAI API compat layer default of 2048
    if (job.max_tokens !== undefined) {
        request.max_tokens = job.max_tokens;
    }
    else if (capabilities.max_tokens_default !== undefined) {
        request.max_tokens = capabilities.max_tokens_default;
    }
    // Debug log request to llama.cpp (level 2)
    logClientToLlama(request);
    const requestBody = JSON.stringify(request);
    const response = await fetch(`${llamaUrl}/v1/chat/completions`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: requestBody,
        signal: abortSignal, // Pass abort signal for cancellation
    });
    if (!response.ok) {
        // Try to get error details from response body
        let errorDetails = '';
        try {
            const errorBody = await response.text();
            errorDetails = errorBody ? ` - ${errorBody.substring(0, 500)}` : '';
        }
        catch {
            // Ignore if we can't read error body
        }
        console.error(`[llama.cpp] Request failed:`, {
            status: response.status,
            statusText: response.statusText,
            request: JSON.stringify(request, null, 2),
            errorBody: errorDetails,
        });
        throw new Error(`llama.cpp request failed: ${response.status} ${response.statusText}${errorDetails}`);
    }
    if (!response.body) {
        throw new Error('No response body from llama.cpp');
    }
    // Process stream
    let fullResponse = '';
    let timings = undefined;
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    // Track tool calls as they stream in
    const toolCallsMap = new Map();
    // Create SSE parser
    const parser = createParser((event) => {
        // Skip reconnection interval messages
        if (event.type === 'reconnect-interval')
            return;
        // Check for [DONE] signal
        if (event.data === '[DONE]')
            return;
        try {
            const chunk = JSON.parse(event.data);
            // Capture timings from the final chunk
            if (chunk.timings) {
                timings = chunk.timings;
            }
            // Extract content
            const content = chunk.choices[0]?.delta?.content;
            if (content) {
                fullResponse += content;
                onToken(content);
            }
            // Extract tool_calls (streamed incrementally)
            const toolCalls = chunk.choices[0]?.delta?.tool_calls;
            if (toolCalls) {
                for (const tc of toolCalls) {
                    const index = tc.index ?? 0;
                    // Get or create tool call entry
                    if (!toolCallsMap.has(index)) {
                        toolCallsMap.set(index, {
                            id: tc.id,
                            type: 'function',
                            function: {
                                name: tc.function?.name,
                                arguments: tc.function?.arguments || '',
                            },
                        });
                    }
                    else {
                        // Append to existing tool call (arguments stream incrementally)
                        const existing = toolCallsMap.get(index);
                        if (tc.id)
                            existing.id = tc.id;
                        if (tc.function?.name)
                            existing.function.name = tc.function.name;
                        if (tc.function?.arguments) {
                            existing.function.arguments = (existing.function.arguments || '') + tc.function.arguments;
                        }
                    }
                }
            }
        }
        catch (error) {
            console.error('[SSE Parser] Error parsing chunk:', error);
        }
    });
    try {
        while (true) {
            // Check for abort before reading next chunk
            if (abortSignal.aborted) {
                break; // Exit gracefully with partial response
            }
            const { done, value } = await reader.read();
            if (done)
                break;
            // Decode and feed to SSE parser
            const text = decoder.decode(value, { stream: true });
            // Debug log raw SSE chunk (level 2)
            logLlamaToClient(text);
            parser.feed(text);
        }
    }
    catch (error) {
        // If aborted, return partial response gracefully
        if (error instanceof Error && error.name === 'AbortError') {
            console.log(`[Job ${job.job_id}] Cancelled - returning partial response`);
            // Don't throw - return what we have so far
        }
        else {
            throw error; // Re-throw non-abort errors
        }
    }
    finally {
        reader.releaseLock();
    }
    // Use real timing data from llama.cpp if available
    let durationMs;
    let inputTokens;
    let outputTokens;
    if (timings) {
        // TypeScript has narrowing issues with optional types in closures, so we explicitly type
        const t = timings;
        durationMs = Math.round(t.prompt_ms + t.predicted_ms);
        inputTokens = t.prompt_n;
        outputTokens = t.predicted_n;
        console.log(`[Job ${job.job_id}] Completed in ${durationMs}ms (${inputTokens} prompt, ${outputTokens} predicted)`);
    }
    else {
        // Fallback to estimates if no timing data
        console.warn(`[Job ${job.job_id}] No timing data from llama.cpp`);
        durationMs = Date.now() - startTime;
        inputTokens = Math.ceil(JSON.stringify(job.messages).length / 4);
        outputTokens = Math.ceil(fullResponse.length / 4);
    }
    // Convert tool calls map to array (if any)
    const finalToolCalls = toolCallsMap.size > 0
        ? Array.from(toolCallsMap.values())
            .filter(tc => tc.id && tc.function.name) // Only include complete tool calls
            .map(tc => ({
                id: tc.id,
                type: 'function',
                function: {
                    name: tc.function.name,
                    arguments: tc.function.arguments || '{}',
                },
            }))
        : undefined;
    // Log tool calls if present
    if (finalToolCalls && finalToolCalls.length > 0) {
        console.log(`[Job ${job.job_id}] Detected ${finalToolCalls.length} tool call(s):`, finalToolCalls.map(tc => tc.function.name).join(', '));
    }
    return {
        fullResponse,
        durationMs,
        inputTokens,
        outputTokens,
        tool_calls: finalToolCalls,
    };
}
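A minimal sketch of how processJob is driven (not part of the package; the job and capabilities objects are placeholders whose field names follow the code above):

import { processJob } from './llama.js';

const controller = new AbortController();
const result = await processJob(
    { job_id: 'job-1', messages: [{ role: 'user', content: 'Hello' }] }, // placeholder job
    'http://127.0.0.1:9002',    // llama.cpp server URL
    { max_tokens_default: -1 }, // capabilities, as read from the server's /props
    controller.signal,          // abort to stop early and keep the partial response
    (token) => process.stdout.write(token),
);
// result: { fullResponse, durationMs, inputTokens, outputTokens, tool_calls? }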