@auxot/worker-cli 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@auxot/worker-cli",
-  "version": "0.1.2",
+  "version": "0.1.4",
   "type": "module",
   "description": "Auxot GPU worker CLI - connects local GPU resources to Auxot platform",
   "license": "UNLICENSED",
@@ -34,20 +34,20 @@
     "dev": "tsx src/index.ts",
     "dev:debug": "tsx src/index.ts --debug 1",
     "dev:debug2": "tsx src/index.ts --debug 2",
-    "build": "tsc",
+    "build": "node build.mjs",
     "start": "node dist/index.js",
     "prepublishOnly": "npm run build"
   },
   "dependencies": {
-    "@auxot/model-registry": "*",
-    "@auxot/shared": "*",
     "eventsource-parser": "^1.1.2",
     "ws": "^8.16.0",
     "uuid": "^9.0.1"
   },
   "devDependencies": {
+    "@auxot/model-registry": "*",
     "@types/node": "^20.10.6",
     "@types/ws": "^8.5.10",
+    "esbuild": "^0.20.0",
     "tsx": "^4.7.0",
     "typescript": "^5.3.3"
   },
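The build step changes from plain tsc to node build.mjs, esbuild is added as a devDependency, @auxot/model-registry moves from dependencies to devDependencies, and @auxot/shared is removed from the manifest. Together with the deleted dist modules below, this suggests the worker is now bundled into a single dist/index.js rather than shipped as individually compiled files. The build.mjs script itself is not part of this diff; the following is only a sketch of what an esbuild-based bundling script of that shape might look like, with the entry point, target, and externals assumed rather than taken from the package:

// build.mjs -- illustrative sketch only; the published script may differ.
import { build } from 'esbuild';

await build({
  entryPoints: ['src/index.ts'], // assumed entry point
  bundle: true,                  // inlines workspace deps such as @auxot/model-registry
  platform: 'node',
  format: 'esm',                 // matches "type": "module"
  target: 'node20',
  outfile: 'dist/index.js',      // matches the package's "start" script
  external: ['ws'],              // runtime deps left as regular imports (assumption)
});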
@@ -1,125 +0,0 @@
-/**
- * llama.cpp Capability Discovery
- *
- * Queries llama.cpp OpenAI-compatible API to discover:
- * - Available models
- * - Context size
- * - Model parameters
- * - VRAM/GPU info (if available)
- */
-/**
- * Discover llama.cpp capabilities via OpenAI-compatible API
- */
-/**
- * Normalize model name from file path
- *
- * Examples:
- * - "/Users/kminkler/.models/Qwen3-Coder-30B-A3B-Instruct-Q4_K_S.gguf"
- *   -> "Qwen3-Coder-30B-A3B-Instruct"
- * - "llama-2-7b-chat.Q4_K_M.gguf"
- *   -> "llama-2-7b-chat"
- */
-function normalizeModelName(filePath) {
-    // Extract filename from path
-    const filename = filePath.split('/').pop() || filePath;
-    // Remove .gguf extension
-    let name = filename.replace(/\.gguf$/i, '');
-    // Remove common quantization suffixes (Q4_K_M, Q4_K_S, Q5_K_M, etc.)
-    name = name.replace(/-?(Q\d+_[KM]_[SM]|Q\d+_[KM]|Q\d+)$/i, '');
-    // Clean up any trailing hyphens or underscores
-    name = name.replace(/[-_]+$/, '');
-    return name;
-}
-export async function discoverCapabilities(llamaUrl) {
-    try {
-        // Query /v1/models endpoint
-        const modelsResponse = await fetch(`${llamaUrl}/v1/models`);
-        if (!modelsResponse.ok) {
-            throw new Error(`Failed to fetch models: ${modelsResponse.status} ${modelsResponse.statusText}`);
-        }
-        const modelsData = await modelsResponse.json();
-        if (!modelsData.data || modelsData.data.length === 0) {
-            throw new Error('No models found in llama.cpp server');
-        }
-        // Use first model (typically only one model loaded)
-        const model = modelsData.data[0];
-        // Normalize model name (remove path and quantization suffix)
-        const modelName = normalizeModelName(model.id);
-        const capabilities = {
-            backend: 'llama.cpp',
-            model: modelName,
-            ctx_size: 4096, // Default, will try to get actual from /props
-        };
-        // Extract parameter count if available
-        if (model.meta?.n_params) {
-            const params = model.meta.n_params;
-            if (params >= 1e9) {
-                capabilities.parameters = `${Math.round(params / 1e9)}B`;
-            }
-            else if (params >= 1e6) {
-                capabilities.parameters = `${Math.round(params / 1e6)}M`;
-            }
-        }
-        // Try to get runtime context size from /props endpoint
-        // This gives us the actual --ctx-size value, not n_ctx_train
-        try {
-            const propsResponse = await fetch(`${llamaUrl}/props`);
-            if (propsResponse.ok) {
-                const props = await propsResponse.json();
-                // Extract runtime context size (this is the --ctx-size value)
-                if (props.default_generation_settings?.n_ctx) {
-                    capabilities.ctx_size = props.default_generation_settings.n_ctx;
-                }
-                // Extract default max_tokens (for unlimited generation)
-                // llama.cpp defaults to -1 (unlimited) but OpenAI API compat layer defaults to 2048
-                // We want to explicitly send the server's max to override the API layer default
-                // Path is: default_generation_settings.params.n_predict
-                if (props.default_generation_settings?.params?.n_predict !== undefined) {
-                    capabilities.max_tokens_default = props.default_generation_settings.params.n_predict;
-                }
-                // Extract total_slots (parallel job capacity)
-                if (props.total_slots) {
-                    capabilities.total_slots = props.total_slots;
-                }
-                // Extract VRAM info if available
-                if (props.total_vram_mb) {
-                    capabilities.vram_gb = Math.round(props.total_vram_mb / 1024);
-                }
-                // If we got the context size from /props, we're done
-                if (capabilities.ctx_size !== 4096) {
-                    console.log('Discovered capabilities:', capabilities);
-                    return capabilities;
-                }
-            }
-        }
-        catch (propsError) {
-            console.warn('/props endpoint not available, trying /health');
-        }
-        // Try /health endpoint as fallback
-        try {
-            const healthResponse = await fetch(`${llamaUrl}/health`);
-            if (healthResponse.ok) {
-                const health = await healthResponse.json();
-                // Some versions expose n_ctx in health endpoint
-                if (health.n_ctx) {
-                    capabilities.ctx_size = health.n_ctx;
-                    console.log(`Runtime context size from /health: ${capabilities.ctx_size}`);
-                }
-            }
-        }
-        catch {
-            console.warn('/health endpoint not available');
-        }
-        // Last resort: use n_ctx_train as estimate
-        if (capabilities.ctx_size === 4096 && model.meta?.n_ctx_train) {
-            console.warn('Could not determine runtime context size, using n_ctx_train as fallback');
-            capabilities.ctx_size = model.meta.n_ctx_train;
-        }
-        console.log('Discovered capabilities:', capabilities);
-        return capabilities;
-    }
-    catch (error) {
-        console.error('Failed to discover capabilities:', error);
-        throw error;
-    }
-}
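The deleted module above (its file name is not shown in this diff) probed a local llama.cpp server through its OpenAI-compatible API: /v1/models for the model list, /props for the runtime context size, slot count, and VRAM, /health as a fallback, and finally n_ctx_train as a last-resort estimate. For reference, a minimal usage sketch of the exported helper; the import path and server URL are illustrative:

// Illustrative sketch only; mirrors the exported API of the deleted module.
import { discoverCapabilities } from './capabilities.js'; // hypothetical file name

const caps = await discoverCapabilities('http://127.0.0.1:8080'); // example llama.cpp URL
// e.g. { backend: 'llama.cpp', model: 'Qwen3-Coder-30B-A3B-Instruct', ctx_size: 4096, ... }
console.log(caps.model, caps.ctx_size, caps.total_slots ?? 1);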
package/dist/debug.js DELETED
@@ -1,54 +0,0 @@
-/**
- * Debug logging utilities
- *
- * Controlled by --debug flag with optional level (1 or 2)
- * Level 1: WebSocket messages between CLI and server
- * Level 2: Level 1 + llama.cpp messages
- */
-let debugLevel = 0;
-export function setDebugLevel(level) {
-    debugLevel = level;
-}
-export function getDebugLevel() {
-    return debugLevel;
-}
-/**
- * Log message from WebSocket server to CLI
- */
-export function logServerToClient(message) {
-    if (debugLevel < 1)
-        return;
-    console.log('cli <<< server');
-    console.log(JSON.stringify(message, null, 2));
-    console.log('');
-}
-/**
- * Log message from CLI to WebSocket server
- */
-export function logClientToServer(message) {
-    if (debugLevel < 1)
-        return;
-    console.log('cli >>> server');
-    console.log(JSON.stringify(message, null, 2));
-    console.log('');
-}
-/**
- * Log request from CLI to llama.cpp
- */
-export function logClientToLlama(request) {
-    if (debugLevel < 2)
-        return;
-    console.log('llama.cpp << cli');
-    console.log(JSON.stringify(request, null, 2));
-    console.log('');
-}
-/**
- * Log response chunk from llama.cpp to CLI
- */
-export function logLlamaToClient(chunk) {
-    if (debugLevel < 2)
-        return;
-    console.log('llama.cpp >> cli');
-    console.log(chunk);
-    console.log('');
-}
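The deleted debug module gates its output on a numeric level: level 1 prints WebSocket traffic between the CLI and the server, level 2 additionally prints llama.cpp requests and response chunks. A minimal sketch of how those exports fit together (the payloads are made up):

// Illustrative sketch only; mirrors the exported API of the deleted dist/debug.js.
import { setDebugLevel, logClientToServer, logClientToLlama } from './debug.js';

setDebugLevel(2); // level 1 logging (WebSocket) plus level 2 logging (llama.cpp)
logClientToServer({ type: 'register', gpu_id: 'example-id' }); // printed at level >= 1
logClientToLlama({ model: 'llama-2-7b-chat', stream: true });  // printed at level >= 2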
@@ -1,171 +0,0 @@
-/**
- * GPU Hardware Detection
- *
- * Detects available GPU hardware to determine which llama.cpp binary variant to use.
- *
- * Detection order:
- * - Windows: NVIDIA CUDA → CPU (with warning)
- * - Linux: NVIDIA CUDA → Vulkan → CPU (with warning)
- * - macOS: Metal (built-in) → CPU (with warning, rare)
- */
-import { exec } from 'child_process';
-import { promisify } from 'util';
-import { platform } from 'os';
-const execAsync = promisify(exec);
-/**
- * Detect GPU hardware and return appropriate backend
- */
-export async function detectGpuBackend() {
-    const os = platform();
-    if (os === 'darwin') {
-        return await detectMacOSGpu();
-    }
-    else if (os === 'linux') {
-        return await detectLinuxGpu();
-    }
-    else if (os === 'win32') {
-        return await detectWindowsGpu();
-    }
-    // Unknown platform - default to CPU with warning
-    return {
-        backend: 'cpu',
-        detected: false,
-        warning: `Unknown platform: ${os}. Defaulting to CPU variant.`,
-    };
-}
-/**
- * Detect GPU on macOS
- *
- * Apple Silicon (M1/M2/M3) always has Metal GPU.
- * Intel Macs may or may not have Metal (depends on GPU model).
- */
-async function detectMacOSGpu() {
-    try {
-        // Check if Metal is available (should be on all modern macOS)
-        // Metal is built into macOS, so we assume it's available
-        // The standard binaries include Metal support
-        return {
-            backend: 'metal',
-            detected: true,
-        };
-    }
-    catch (error) {
-        // Fallback to CPU
-        return {
-            backend: 'cpu',
-            detected: false,
-            warning: 'Metal GPU not detected. Using CPU variant (performance will be limited).',
-        };
-    }
-}
-/**
- * Detect GPU on Linux
- *
- * Detection order:
- * 1. NVIDIA CUDA (nvidia-smi)
- * 2. Vulkan (vulkaninfo or lspci)
- * 3. CPU (fallback with warning)
- */
-async function detectLinuxGpu() {
-    // Try NVIDIA CUDA first
-    try {
-        await execAsync('nvidia-smi --query-gpu=name --format=csv,noheader', {
-            timeout: 5000,
-        });
-        // NVIDIA GPU detected
-        // Note: llama.cpp releases don't include CUDA binaries for Linux
-        // Users need to build from source for CUDA on Linux
-        // For now, use Vulkan (works with NVIDIA GPUs too)
-        return {
-            backend: 'vulkan',
-            detected: true,
-        };
-    }
-    catch (error) {
-        // nvidia-smi not found or failed - continue to Vulkan check
-    }
-    // Try Vulkan (works with AMD, NVIDIA, Intel GPUs)
-    try {
-        await execAsync('vulkaninfo --summary 2>/dev/null', {
-            timeout: 5000,
-        });
-        // Vulkan GPU detected
-        return {
-            backend: 'vulkan',
-            detected: true,
-        };
-    }
-    catch (error) {
-        // vulkaninfo not found - try lspci as fallback
-        try {
-            const { stdout } = await execAsync('lspci | grep -i vga', {
-                timeout: 5000,
-            });
-            if (stdout && stdout.trim()) {
-                // Some GPU detected (AMD/NVIDIA/Intel) - try Vulkan anyway
-                // Vulkan binaries should work even if vulkaninfo isn't installed
-                return {
-                    backend: 'vulkan',
-                    detected: true,
-                };
-            }
-        }
-        catch (error) {
-            // lspci failed or no GPU found
-        }
-    }
-    // No GPU detected - use CPU with warning
-    return {
-        backend: 'cpu',
-        detected: false,
-        warning: 'No GPU detected. Using CPU variant (performance will be severely limited). Consider using models <= 7B.',
-    };
-}
-/**
- * Detect GPU on Windows
- *
- * Detection order:
- * 1. NVIDIA CUDA (nvidia-smi)
- * 2. CPU (fallback with warning)
- */
-async function detectWindowsGpu() {
-    // Try NVIDIA CUDA
-    try {
-        await execAsync('nvidia-smi --query-gpu=name --format=csv,noheader', {
-            timeout: 5000,
-            shell: 'cmd.exe',
-        });
-        // NVIDIA GPU detected
-        return {
-            backend: 'cuda',
-            detected: true,
-        };
-    }
-    catch (error) {
-        // nvidia-smi not found or failed - no NVIDIA GPU
-    }
-    // Try to check for other GPUs via WMI (AMD, Intel)
-    try {
-        const { stdout } = await execAsync('wmic path win32_VideoController get name', {
-            timeout: 5000,
-            shell: 'cmd.exe',
-        });
-        if (stdout && stdout.includes('AMD') || stdout.includes('Radeon') || stdout.includes('NVIDIA')) {
-            // Some GPU detected but not NVIDIA - use CPU (Windows releases don't have Vulkan binaries easily available)
-            return {
-                backend: 'cpu',
-                detected: false,
-                warning: 'Non-NVIDIA GPU detected. Using CPU variant (CUDA binaries require NVIDIA GPUs). Consider using models <= 7B.',
-            };
-        }
-    }
-    catch (error) {
-        // WMI failed
-    }
-    // No GPU detected - use CPU with warning
-    return {
-        backend: 'cpu',
-        detected: false,
-        warning: 'No GPU detected. Using CPU variant (performance will be severely limited). Consider using models <= 7B.',
-    };
-}
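The deleted GPU-detection module above (its file name is not shown in this diff) resolves to one of the backends 'metal', 'cuda', 'vulkan', or 'cpu' and attaches a warning string when it has to fall back. A minimal caller sketch; the import path is hypothetical:

// Illustrative sketch only; mirrors the exported API of the deleted module.
import { detectGpuBackend } from './gpu-detect.js'; // hypothetical file name

const gpu = await detectGpuBackend();
if (gpu.warning) {
    console.warn(gpu.warning);
}
console.log(`Selected llama.cpp variant: ${gpu.backend} (detected: ${gpu.detected})`);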
package/dist/gpu-id.js DELETED
@@ -1,48 +0,0 @@
-/**
- * GPU ID Management
- *
- * Generates and persists a stable UUID for this GPU worker.
- * Stored in ~/.auxot/gpu-id (or %USERPROFILE%\.auxot\gpu-id on Windows)
- */
-import { randomUUID } from 'crypto';
-import { readFile, writeFile, mkdir } from 'fs/promises';
-import { homedir } from 'os';
-import { join } from 'path';
-const AUXOT_DIR = join(homedir(), '.auxot');
-const GPU_ID_FILE = join(AUXOT_DIR, 'gpu-id');
-/**
- * Get or create a stable GPU ID
- *
- * Returns the same UUID across restarts of the worker CLI.
- */
-export async function getOrCreateGpuId() {
-    try {
-        // Try to read existing GPU ID
-        const existingId = await readFile(GPU_ID_FILE, 'utf-8');
-        const trimmed = existingId.trim();
-        // Validate it's a UUID
-        const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
-        if (uuidRegex.test(trimmed)) {
-            return trimmed;
-        }
-        console.warn('Invalid GPU ID found, generating new one');
-    }
-    catch (error) {
-        // File doesn't exist or can't be read - generate new ID
-    }
-    // Generate new UUID
-    const newId = randomUUID();
-    try {
-        // Ensure directory exists
-        await mkdir(AUXOT_DIR, { recursive: true });
-        // Write GPU ID
-        await writeFile(GPU_ID_FILE, newId, 'utf-8');
-        console.log(`Generated new GPU ID: ${newId}`);
-        console.log(`Stored in: ${GPU_ID_FILE}`);
-    }
-    catch (error) {
-        console.error('Failed to save GPU ID:', error);
-        throw error;
-    }
-    return newId;
-}
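The deleted gpu-id module persists a single UUID under ~/.auxot/gpu-id so the worker keeps the same identity across restarts. Typical use of the exported helper (illustrative only):

// Illustrative sketch only; mirrors the exported API of the deleted dist/gpu-id.js.
import { getOrCreateGpuId } from './gpu-id.js';

const gpuId = await getOrCreateGpuId(); // reads ~/.auxot/gpu-id, or generates and stores a new UUID
console.log(`Registering worker as ${gpuId}`);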