@mariozechner/pi 0.2.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +392 -294
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +348 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/models.d.ts +39 -0
- package/dist/commands/models.d.ts.map +1 -0
- package/dist/commands/models.js +612 -0
- package/dist/commands/models.js.map +1 -0
- package/dist/commands/pods.d.ts +21 -0
- package/dist/commands/pods.d.ts.map +1 -0
- package/dist/commands/pods.js +175 -0
- package/dist/commands/pods.js.map +1 -0
- package/dist/commands/prompt.d.ts +7 -0
- package/dist/commands/prompt.d.ts.map +1 -0
- package/dist/commands/prompt.js +55 -0
- package/dist/commands/prompt.js.map +1 -0
- package/dist/config.d.ts +11 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +74 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/model-configs.d.ts +22 -0
- package/dist/model-configs.d.ts.map +1 -0
- package/dist/model-configs.js +75 -0
- package/dist/model-configs.js.map +1 -0
- package/dist/models.json +305 -0
- package/dist/ssh.d.ts +24 -0
- package/dist/ssh.d.ts.map +1 -0
- package/dist/ssh.js +115 -0
- package/dist/ssh.js.map +1 -0
- package/dist/types.d.ts +23 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +38 -40
- package/LICENSE +0 -21
- package/pi.js +0 -1379
- package/pod_setup.sh +0 -74
- package/vllm_manager.py +0 -662
package/pi.js
DELETED
@@ -1,1379 +0,0 @@
#!/usr/bin/env node
/**
 * pi CLI
 */

const fs = require('fs');
const { execSync, spawn } = require('child_process');
const path = require('path');
const os = require('os');

const CONFIG_FILE = path.join(os.homedir(), '.pi_config');
const SCRIPT_DIR = __dirname;

class PiCli {
  constructor() {
    this.loadConfig();
  }

  loadConfig() {
    if (fs.existsSync(CONFIG_FILE)) {
      this.config = JSON.parse(fs.readFileSync(CONFIG_FILE, 'utf8'));
      // Migrate old single-pod config
      if (this.config.ssh && !this.config.pods) {
        this.config = {
          pods: { 'default': { ssh: this.config.ssh } },
          active: 'default'
        };
        this.saveConfig();
      }
    } else {
      this.config = { pods: {}, active: null };
    }
  }

  saveConfig() {
    fs.writeFileSync(CONFIG_FILE, JSON.stringify(this.config, null, 2));
  }

  getActivePod() {
    if (!this.config.active || !this.config.pods[this.config.active]) {
      return null;
    }
    return this.config.pods[this.config.active];
  }

  ssh(command, interactive = false, skipPirc = false, podName = null) {
    const pod = podName ? this.config.pods[podName] : this.getActivePod();
    if (!pod) {
      if (podName) {
        console.error(`Pod '${podName}' not found`);
        console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
      } else {
        console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
        console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
        console.error('Or activate an existing pod: pi pod <pod-name>');
      }
      process.exit(1);
    }

    // Wrap command to source .pirc first (if it exists), unless skipPirc is true
    const finalCommand = skipPirc ? command : `[ -f ~/.pirc ] && source ~/.pirc; ${command}`;

    if (interactive) {
      // For interactive commands, use spawn with shell
      const sshParts = pod.ssh.split(' ');
      const sshCmd = ['ssh', ...sshParts, finalCommand];
      const proc = spawn(sshCmd[0], sshCmd.slice(1), { stdio: 'inherit', shell: false });
      return new Promise((resolve) => {
        proc.on('close', resolve);
      });
    } else {
      const sshCmd = `ssh ${pod.ssh} ${JSON.stringify(finalCommand)}`;

      // For non-interactive, use execSync
      try {
        return execSync(sshCmd, { encoding: 'utf8' });
      } catch (e) {
        if (e.status !== 0) {
          console.error('SSH command failed:', e.message);
          process.exit(1);
        }
        throw e;
      }
    }
  }

  scp(localFile, remotePath = '~/', podName = null) {
    const pod = podName ? this.config.pods[podName] : this.getActivePod();
    if (!pod) {
      if (podName) {
        console.error(`Pod '${podName}' not found`);
      } else {
        console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
      }
      process.exit(1);
    }

    const [userHost, ...sshArgs] = pod.ssh.split(' ');
    let scpCmd = `scp`;

    // Add port if specified
    const portArg = sshArgs.find(arg => arg === '-p');
    if (portArg) {
      const portIndex = sshArgs.indexOf(portArg);
      const port = sshArgs[portIndex + 1];
      scpCmd += ` -P ${port}`;
    }

    scpCmd += ` ${localFile} ${userHost}:${remotePath}`;

    try {
      execSync(scpCmd, { stdio: 'inherit' });
    } catch (e) {
      console.error('SCP failed:', e.message);
      process.exit(1);
    }
  }

  async setup(podName, sshCommand) {
    if (!podName || !sshCommand) {
      console.error('Usage: pi setup <pod-name> <ssh_command>');
      console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
      process.exit(1);
    }

    // Remove "ssh " prefix if present
    if (sshCommand.toLowerCase().startsWith('ssh ')) {
      sshCommand = sshCommand.substring(4);
    }

    // Save pod config
    if (!this.config.pods) {
      this.config.pods = {};
    }
    this.config.pods[podName] = { ssh: sshCommand };
    this.config.active = podName;
    this.saveConfig();
    console.log(`Saved pod '${podName}' with SSH: ${sshCommand}`);

    // Test connection
    console.log('\nTesting SSH connection...');
    try {
      const hostname = this.ssh('hostname', false, true).trim();
      console.log(`✓ Connected to ${hostname}`);
    } catch (e) {
      console.error('✗ SSH connection failed');
      process.exit(1);
    }

    // Copy setup files
    console.log('\nCopying setup files...');
    this.scp(path.join(SCRIPT_DIR, 'pod_setup.sh'));
    this.scp(path.join(SCRIPT_DIR, 'vllm_manager.py'));

    // Run setup with HF_TOKEN
    console.log('\nRunning setup script...');
    const hfToken = process.env.HF_TOKEN;
    if (!hfToken) {
      console.error('\nERROR: HF_TOKEN environment variable not set');
      console.error('Please export HF_TOKEN before running setup');
      process.exit(1);
    }
    await this.ssh(`export HF_TOKEN="${hfToken}" && bash pod_setup.sh`, true, true);

    console.log('\n✓ Setup complete!');

    // Show usage help
    this.showHelp();
  }

  list(podName = null) {
    const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
    console.log(output);
  }

  parseContextSize(value) {
    if (!value) return 8192;

    // Convert string to lowercase for case-insensitive matching
    const lower = value.toString().toLowerCase();

    // Handle 'k' suffix (4k, 8k, 32k, etc)
    if (lower.endsWith('k')) {
      return parseInt(lower.slice(0, -1)) * 1024;
    }

    // Handle plain numbers
    return parseInt(value);
  }

  parseMemory(value) {
    if (!value) return 0.9;

    const str = value.toString().toLowerCase();

    // Handle percentage (30%, 50%, etc)
    if (str.endsWith('%')) {
      return parseInt(str.slice(0, -1)) / 100;
    }

    // Handle decimal (0.3, 0.5, etc)
    const num = parseFloat(str);
    if (num > 1) {
      console.error('Memory must be between 0-1 or 0-100%');
      process.exit(1);
    }
    return num;
  }

  async handleStart(args) {
    if (!args[0]) {
      console.error('Usage: pi start <model> [options]');
      console.error('');
      console.error('Options:');
      console.error(' --name <name> Model alias (default: auto-generated)');
      console.error(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k or 4096, 8192, etc (default: model default)');
      console.error(' --memory <percent> GPU memory: 30%, 50%, 90% or 0.3, 0.5, 0.9 (default: 90%)');
      console.error(' --all-gpus Use all GPUs with tensor parallelism (ignores --memory)');
      console.error(' --debug Enable debug logging for vLLM');
      console.error(' --pod <name> Run on specific pod (default: active pod)');
      console.error(' --vllm-args Pass remaining args directly to vLLM (ignores other options)');
      console.error('');
      console.error('Examples:');
      console.error(' pi start Qwen/Qwen2.5-7B-Instruct');
      console.error(' pi start Qwen/Qwen2.5-7B-Instruct --name qwen --memory 20%');
      console.error(' pi start meta-llama/Llama-3.1-70B-Instruct --all-gpus');
      console.error(' pi start meta-llama/Llama-3.1-405B --all-gpus --context 128k');
      console.error('');
      console.error(' # Custom vLLM args for Qwen3-Coder on 8xH200:');
      console.error(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\\\');
      console.error(' --data-parallel-size 8 --enable-expert-parallel \\\\');
      console.error(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');
      process.exit(1);
    }

    const modelId = args[0];
    let name = null;
    let context = null; // Changed to null - let vLLM use model default
    let memory = 0.9;
    let allGpus = false;
    let debug = false;
    let vllmArgs = null;
    let podName = null;

    // Check for --vllm-args first
    const vllmArgsIndex = args.indexOf('--vllm-args');
    if (vllmArgsIndex !== -1) {
      // Extract name and pod if provided before --vllm-args
      for (let i = 1; i < vllmArgsIndex; i++) {
        if (args[i] === '--name' && args[i + 1]) {
          name = args[++i];
        } else if (args[i] === '--pod' && args[i + 1]) {
          podName = args[++i];
        } else if (args[i] === '--debug') {
          debug = true;
        }
      }
      // Everything after --vllm-args is passed to vLLM
      vllmArgs = args.slice(vllmArgsIndex + 1).join(' ');
    } else {
      // Parse normal arguments
      for (let i = 1; i < args.length; i++) {
        switch (args[i]) {
          case '--name':
            name = args[++i];
            break;
          case '--context':
            context = this.parseContextSize(args[++i]);
            break;
          case '--memory':
            memory = this.parseMemory(args[++i]);
            break;
          case '--all-gpus':
            allGpus = true;
            break;
          case '--debug':
            debug = true;
            break;
          case '--pod':
            podName = args[++i];
            break;
          default:
            console.error(`Unknown option: ${args[i]}`);
            process.exit(1);
        }
      }
    }

    // Check for multi-GPU setup
    const gpuCount = await this.getGpuCount(podName);

    if (allGpus) {
      if (memory !== 0.9) {
        console.log('Warning: --memory ignored with --all-gpus (using 95% memory across all GPUs)');
      }
      memory = 0.95;

      if (gpuCount === 1) {
        console.log('Note: --all-gpus specified but only 1 GPU found');
        allGpus = false;
      }
    }

    // Auto-generate name if not provided
    if (!name) {
      // Extract model name from path (e.g., "Phi-3-mini" from "microsoft/Phi-3-mini-4k-instruct")
      const parts = modelId.split('/');
      const modelName = parts[parts.length - 1];
      name = modelName.toLowerCase()
        .replace(/-instruct$/, '')
        .replace(/-chat$/, '')
        .replace(/[^a-z0-9-]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-|-$/g, '')
        .slice(0, 20);
    }

    // If vllmArgs provided, skip memory check since we don't know the parallelism strategy
    if (vllmArgs) {
      const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
      if (modelEstimate) {
        console.log(`Model weights: ${modelEstimate.modelSizeGB.toFixed(1)}GB`);
        console.log(`Context length: ${modelEstimate.contextLength.toLocaleString()} tokens`);
      }
      console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
      await this.startRaw(modelId, name, vllmArgs, debug, podName);
      return;
    }

    // For standard deployment, check memory
    const modelEstimate = await this.getModelMemoryEstimate(modelId, context);

    // Check GPU memory before starting
    console.log('Checking model size and GPU memory...');
    console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
    const [memoryInfo, modelEstimateWithContext] = await Promise.all([
      this.getGpuMemoryInfo(podName),
      modelEstimate
    ]);

    if (memoryInfo && modelEstimateWithContext) {
      // For tensor parallel (--all-gpus), memory is distributed across GPUs
      const effectiveMemoryNeeded = allGpus && gpuCount > 1
        ? modelEstimateWithContext.estimatedMemoryGB / gpuCount
        : modelEstimateWithContext.estimatedMemoryGB;

      const memoryPerGpu = memoryInfo.freeMemoryGB / (gpuCount || 1);

      console.log(`Model weights: ${modelEstimateWithContext.modelSizeGB.toFixed(1)}GB`);
      console.log(`Context length: ${modelEstimateWithContext.contextLength.toLocaleString()} tokens`);
      console.log(`Note: Estimate includes model parameters only, not KV cache for context`);
      console.log(`Available GPU memory: ${memoryInfo.freeMemoryGB.toFixed(1)}GB total (${memoryPerGpu.toFixed(1)}GB per GPU)`);

      if (effectiveMemoryNeeded > memoryPerGpu) {
        // Log a BIG WARNING as requested
        console.error(`\n❌ BIG WARNING: Insufficient GPU memory`);
        if (allGpus && gpuCount > 1) {
          console.error(` Model needs ~${effectiveMemoryNeeded.toFixed(1)}GB per GPU but only ${memoryPerGpu.toFixed(1)}GB available`);
        } else {
          console.error(` Model needs ~${modelEstimateWithContext.estimatedMemoryGB.toFixed(1)}GB but only ${memoryInfo.freeMemoryGB.toFixed(1)}GB available`);
        }
        console.error('\n Free up memory by stopping running models:');
        console.error(' pi list # See running models');
        console.error(' pi stop <model_name> # Stop specific model');
        console.error(' pi stop # Stop all models\n');
        // Don't exit, just warn and proceed
      }
    }

    // Call the original start method with positional args
    const contextStr = context ? context.toString() : null;
    await this.start(modelId, name, contextStr, memory.toString(), { allGpus, gpuCount, debug, podName });
  }

  async getGpuCount(podName = null) {
    try {
      const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l', false, false, podName);
      return parseInt(output.trim()) || 1;
    } catch {
      return 1;
    }
  }

  async getGpuMemoryInfo(podName = null) {
    try {
      const output = this.ssh('nvidia-smi --query-gpu=memory.total,memory.free --format=csv,noheader,nounits', false, false, podName);
      const lines = output.trim().split('\n');
      let totalMemoryGB = 0;
      let freeMemoryGB = 0;

      for (const line of lines) {
        const [total, free] = line.split(',').map(x => parseInt(x.trim()));
        totalMemoryGB += total / 1024;
        freeMemoryGB += free / 1024;
      }

      return { totalMemoryGB, freeMemoryGB };
    } catch (e) {
      return null;
    }
  }

  async getModelMemoryEstimate(modelId, contextLength = null) {
    try {
      const response = await fetch(`https://huggingface.co/api/models/${modelId}`);
      const data = await response.json();

      if (data.safetensors?.parameters) {
        // Calculate actual model size based on parameter counts and types
        const dtypeSizes = {
          'F64': 8, // float64 - 8 bytes
          'F32': 4, // float32 - 4 bytes
          'BF16': 2, // bfloat16 - 2 bytes
          'F16': 2, // float16 - 2 bytes
          'I32': 4, // int32 - 4 bytes
          'I16': 2, // int16 - 2 bytes
          'I8': 1, // int8 - 1 byte
          'U8': 1, // uint8 - 1 byte
          'I4': 0.5, // int4 - 0.5 bytes (packed)
          'F8_E4M3': 1, // FP8 E4M3 format - 1 byte
          'F8_E5M2': 1, // FP8 E5M2 format - 1 byte
          'Q8_0': 1, // GGML quantization formats
          'Q4_0': 0.5, // GGML quantization formats
          'Q4_1': 0.5, // GGML quantization formats
          'Q5_0': 0.625, // GGML quantization formats
          'Q5_1': 0.625 // GGML quantization formats
        };

        let totalBytes = 0;
        let paramDetails = [];

        // Calculate bytes for each dtype
        let unknownDtypes = [];
        for (const [dtype, paramCount] of Object.entries(data.safetensors.parameters)) {
          let bytesPerParam = dtypeSizes[dtype];
          if (bytesPerParam === undefined) {
            // Unknown dtype - assume 1 byte (most new formats are quantized)
            bytesPerParam = 1; // Conservative for memory checking
            unknownDtypes.push(dtype);
          }
          const bytes = paramCount * bytesPerParam;
          totalBytes += bytes;
          paramDetails.push({ dtype, count: paramCount, bytes });
        }

        if (unknownDtypes.length > 0) {
          console.warn(`Unknown dtype(s) found: ${unknownDtypes.join(', ')}. Assuming 1 byte per parameter.`);
        }

        const modelSizeGB = totalBytes / (1024 ** 3);

        // Try to get model config for context length
        let maxContextLength = contextLength;
        try {
          const configResponse = await fetch(`https://huggingface.co/${modelId}/raw/main/config.json`);
          if (configResponse.ok) {
            const config = await configResponse.json();
            maxContextLength = contextLength || config.max_position_embeddings || 8192;
          }
        } catch (e) {
          maxContextLength = contextLength || 8192;
        }

        return {
          modelSizeGB,
          estimatedMemoryGB: modelSizeGB, // Only model weights, not KV cache
          contextLength: maxContextLength,
          paramDetails // For debugging
        };
      }

      return null;
    } catch (e) {
      return null;
    }
  }

  async start(modelId, name, maxLen = null, gpuMemory, options = {}) {
    // Check if name is already in use locally first
    if (name) {
      const runningModels = this.getRunningModels(options.podName);
      if (runningModels[name]) {
        console.error(`Error: Model name '${name}' is already in use`);
        console.error('Running models:', Object.keys(runningModels).join(', '));
        process.exit(1);
      }
    }

    // Memory check is already done in handleStart, skip it here

    // Build args for vllm_manager.py
    let args = modelId;

    // Handle optional parameters
    if (name || maxLen || gpuMemory) {
      args += ` ${name || '""'}`;

      if (maxLen || gpuMemory) {
        args += ` ${maxLen || '""'}`; // Pass empty string to use vLLM default

        if (gpuMemory) {
          args += ` ${gpuMemory}`;
        }
      }
    }

    // Handle multi-GPU options
    let envPrefix = '';
    if (options.allGpus && options.gpuCount > 1) {
      args += ` ${options.gpuCount}`; // Pass tensor parallel size
    }

    // Add debug logging if requested
    if (options.debug) {
      envPrefix = 'VLLM_LOGGING_LEVEL=DEBUG ';
    }

    const output = this.ssh(`${envPrefix}python3 vllm_manager.py start ${args}`, false, false, options.podName);

    // Extract model name and connection info from output
    const nameMatch = output.match(/Started (\S+)/);
    const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
    const exportMatch = output.match(/export OPENAI_BASE_URL='([^']+)'/);

    if (nameMatch) {
      const modelName = nameMatch[1];
      const url = urlMatch ? urlMatch[1] : null;
      const exportCmd = exportMatch ? `export OPENAI_BASE_URL='${exportMatch[1]}'` : null;

      console.log(`\nStarted ${modelName}`);
      console.log('Waiting for model to initialize...\n');

      // Set up Ctrl+C handler for manual interruption
      const showModelInfo = () => {
        console.log('\n\n' + '='.repeat(60));
        console.log('Model Information:');
        console.log('='.repeat(60));
        console.log(`Name: ${modelName}`);
        if (url) console.log(`URL: ${url}`);
        if (exportCmd) {
          console.log(`\nTo use with OpenAI clients:`);
          console.log(exportCmd);
          console.log(`export OPENAI_API_KEY='dummy'`);
          console.log(`export OPENAI_MODEL='${modelId}'`);
        }
        console.log('='.repeat(60));
      };

      process.on('SIGINT', () => {
        showModelInfo();
        process.exit(0);
      });

      // Watch logs until startup complete
      await this.logs(modelName, true, options.podName); // autoExit = true for startup

      // Warm up the model with a simple prompt
      console.log('\nWarming up model...');
      try {
        const warmupUrl = `${url}/chat/completions`;
        const warmupPayload = {
          model: modelId,
          messages: [{ role: 'user', content: 'Hi' }],
          max_tokens: 1,
          temperature: 0
        };

        const warmupResponse = await fetch(warmupUrl, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify(warmupPayload)
        });

        if (warmupResponse.ok) {
          console.log('✓ Model warmed up and ready!');
        } else {
          console.log('⚠ Warmup failed, but model should still work');
        }
      } catch (e) {
        console.log('⚠ Could not warm up model:', e.message);
      }

      // Show model info after warmup
      showModelInfo();
    } else {
      console.log(output);
    }
  }

  async startRaw(modelId, name, vllmArgs, debug = false, podName = null) {
    // Skip memory check for raw vLLM args since we don't know what custom settings are used
    console.log('Note: Memory checking disabled when using --vllm-args');
    // Check if name is already in use
    const runningModels = this.getRunningModels(podName);
    if (runningModels[name]) {
      console.error(`Error: Model name '${name}' is already in use`);
      console.error('Running models:', Object.keys(runningModels).join(', '));
      process.exit(1);
    }

    console.log(`Starting ${name} with custom vLLM args on pod: ${podName || this.config.active || 'active pod'}`);

    // Start vLLM with raw arguments - use base64 to safely pass complex args
    const base64Args = Buffer.from(vllmArgs).toString('base64');
    const envPrefix = debug ? 'VLLM_LOGGING_LEVEL=DEBUG ' : '';
    const output = this.ssh(`${envPrefix}python3 vllm_manager.py start_raw "${modelId}" "${name}" "${base64Args}"`, false, false, podName);

    // Extract connection info from output
    const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
    const exportMatch = output.match(/export OPENAI_BASE_URL='([^']+)'/);

    if (urlMatch || exportMatch) {
      const url = urlMatch ? urlMatch[1] : null;
      const exportCmd = exportMatch ? `export OPENAI_BASE_URL='${exportMatch[1]}'` : null;

      console.log(`\nStarted ${name}`);
      console.log('Waiting for model to initialize...\n');

      // Set up Ctrl+C handler for manual interruption
      const showModelInfo = () => {
        console.log('\n\n' + '='.repeat(60));
        console.log('Model Information:');
        console.log('='.repeat(60));
        console.log(`Name: ${name}`);
        if (url) console.log(`URL: ${url}`);
        if (exportCmd) {
          console.log(`\nTo use with OpenAI clients:`);
          console.log(exportCmd);
          console.log(`export OPENAI_API_KEY='dummy'`);
          console.log(`export OPENAI_MODEL='${modelId}'`);
        }
        console.log('='.repeat(60));
      };

      process.on('SIGINT', () => {
        showModelInfo();
        process.exit(0);
      });

      // Watch logs until startup complete
      await this.logs(name, true, podName); // autoExit = true for startup

      // Warm up the model with a simple prompt
      console.log('\nWarming up model...');
      try {
        const warmupUrl = `${url}/chat/completions`;
        const warmupPayload = {
          model: modelId,
          messages: [{ role: 'user', content: 'Hi' }],
          max_tokens: 1,
          temperature: 0
        };

        const warmupResponse = await fetch(warmupUrl, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify(warmupPayload)
        });

        if (warmupResponse.ok) {
          console.log('✓ Model warmed up and ready!');
        } else {
          console.log('⚠ Warmup failed, but model should still work');
        }
      } catch (e) {
        console.log('⚠ Could not warm up model:', e.message);
      }

      // Show model info after warmup
      showModelInfo();
    } else {
      console.log(output);
    }
  }

  stop(name, podName = null) {
    if (!name) {
      // Stop all models
      const runningModels = this.getRunningModels(podName);
      const modelNames = Object.keys(runningModels);

      if (modelNames.length === 0) {
        console.log('No models running');
        // Still clean up any hanging vLLM processes
        console.log('Cleaning up any remaining vLLM processes...');
        this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
        return;
      }

      console.log(`Stopping ${modelNames.length} model(s): ${modelNames.join(', ')}`);

      for (const modelName of modelNames) {
        const output = this.ssh(`python3 vllm_manager.py stop ${modelName}`, false, false, podName);
        console.log(output);
      }

      // Final cleanup of vLLM processes after stopping all models
      console.log('Ensuring all vLLM processes are terminated...');
      this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
    } else {
      // Stop specific model
      const output = this.ssh(`python3 vllm_manager.py stop ${name}`, false, false, podName);
      console.log(output);
    }
  }

  async logs(name, autoExit = false, podName = null) {
    if (!name) {
      console.error('Usage: pi logs <name>');
      process.exit(1);
    }

    // Use vllm_manager.py to get the log file path
    const infoOutput = this.ssh(`python3 vllm_manager.py list`, false, false, podName);

    // Extract log file path from the output
    const lines = infoOutput.split('\n');
    let logFile = null;
    let inModel = false;

    for (const line of lines) {
      if (line.startsWith(`${name}:`)) {
        inModel = true;
      } else if (inModel && line.includes('Logs:')) {
        logFile = line.split('Logs:')[1].trim();
        break;
      }
    }

    if (!logFile) {
      console.error(`No logs found for ${name}`);
      process.exit(1);
    }

    // Use a custom tail that watches for startup complete
    const pod = podName ? this.config.pods[podName] : this.getActivePod();
    // Add SSH options to prevent connection issues
    const sshOpts = '-o ServerAliveInterval=5 -o ServerAliveCountMax=3 -o TCPKeepAlive=yes';
    const sshCmd = `ssh ${sshOpts} ${pod.ssh} tail -n 50 -f ${logFile}`;

    return new Promise((resolve) => {
      const [cmd, ...args] = sshCmd.split(' ');
      const proc = spawn(cmd, args, { stdio: ['inherit', 'pipe', 'pipe'] });

      let buffer = '';

      proc.stdout.on('data', (data) => {
        process.stdout.write(data);
        buffer += data.toString();

        // Only check for startup messages if autoExit is enabled
        if (autoExit) {
          if (buffer.includes('Application startup complete.') ||
              buffer.includes('Uvicorn running on')) {
            setTimeout(() => {
              proc.kill();
              resolve();
            }, 500); // Small delay to ensure final messages are shown
          }
        }

        // Keep buffer size manageable
        if (buffer.length > 10000) {
          buffer = buffer.slice(-5000);
        }
      });

      proc.stderr.on('data', (data) => {
        process.stderr.write(data);
      });

      proc.on('close', () => {
        resolve();
      });
    });
  }

  async shell(podName = null) {
    const pod = podName ? this.config.pods[podName] : this.getActivePod();
    if (!pod) {
      if (podName) {
        console.error(`Pod '${podName}' not found`);
        console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
      } else {
        console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
      }
      process.exit(1);
    }

    console.log(`Connecting to pod${podName ? ` '${podName}'` : ''}...`);

    // Use spawn directly for interactive shell
    const sshParts = pod.ssh.split(' ');
    const sshCmd = ['ssh', ...sshParts];
    const proc = spawn(sshCmd[0], sshCmd.slice(1), { stdio: 'inherit' });

    return new Promise((resolve) => {
      proc.on('close', resolve);
    });
  }

  listPods() {
    if (!this.config.pods || Object.keys(this.config.pods).length === 0) {
      console.log('No pods configured. Run: pi setup <pod-name> <ssh_command>');
      return;
    }

    console.log('Configured pods:\n');

    // Show active pod first
    if (this.config.active && this.config.pods[this.config.active]) {
      console.log(`● ${this.config.active} (active)`);
      console.log(` ${this.config.pods[this.config.active].ssh}\n`);
    }

    // Show other pods
    Object.keys(this.config.pods).sort().forEach(name => {
      if (name !== this.config.active) {
        console.log(`○ ${name}`);
        console.log(` ${this.config.pods[name].ssh}`);
      }
    });
  }

  switchPod(podName) {
    if (!this.config.pods || !this.config.pods[podName]) {
      console.error(`Pod '${podName}' not found`);
      console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
      process.exit(1);
    }

    this.config.active = podName;
    this.saveConfig();
    console.log(`Switched to pod: ${podName} (${this.config.pods[podName].ssh})`);
  }

  removePod(podName) {
    if (!this.config.pods || !this.config.pods[podName]) {
      console.error(`Pod '${podName}' not found`);
      console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
      process.exit(1);
    }

    delete this.config.pods[podName];

    // If we removed the active pod, clear it or switch to another
    if (this.config.active === podName) {
      const remainingPods = Object.keys(this.config.pods);
      this.config.active = remainingPods.length > 0 ? remainingPods[0] : null;
    }

    this.saveConfig();
    console.log(`Removed pod: ${podName}`);
    if (this.config.active) {
      console.log(`Active pod is now: ${this.config.active}`);
    }
  }

  async searchModels(query) {
    console.log(`Searching HuggingFace for models matching "${query}"...\n`);

    try {
      const response = await fetch(`https://huggingface.co/api/models?search=${query}&filter=text-generation&sort=downloads&limit=20`);
      const data = await response.json();

      if (!data || data.length === 0) {
        console.log('No models found');
        return;
      }

      // Format results
      console.log('Popular models (sorted by downloads):\n');
      for (const model of data) {
        const modelName = model.modelId.toLowerCase();

        // Skip incompatible formats
        if (modelName.includes('-mlx-') || modelName.includes('-mlx')) {
          continue; // MLX is for Apple Silicon only
        }
        if (modelName.includes('-gguf') || modelName.includes('.gguf')) {
          continue; // GGUF is for llama.cpp, not vLLM
        }

        const downloads = model.downloads || 0;
        const likes = model.likes || 0;

        console.log(`\x1b[1m${model.modelId}\x1b[0m`); // Bold
        console.log(` \x1b[36mhttps://huggingface.co/${model.modelId}\x1b[0m`); // Cyan for URL
        console.log(` Downloads: ${downloads.toLocaleString()} | Likes: ${likes}`);

        // Check for quantization
        if (modelName.includes('-fp8') || modelName.includes('fp8-')) {
          console.log(` \x1b[33mNote: FP8 quantized - requires GPU with FP8 support\x1b[0m`);
        }

        console.log(` pi start ${model.modelId}`);
        console.log();
      }

      // Add HuggingFace search URL
      console.log(`\nView more models on HuggingFace:`);
      console.log(`\x1b[36mhttps://huggingface.co/models?search=${encodeURIComponent(query)}&sort=downloads&pipeline_tag=text-generation\x1b[0m`);
    } catch (error) {
      console.error('Error searching models:', error.message);
    }
  }

  async checkDownloads(podName = null, live = false) {
    // Check only active pod or specified pod
    const targetPod = podName || this.config.active;
    if (!targetPod || !this.config.pods[targetPod]) {
      console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
      process.exit(1);
    }

    if (!live) {
      // Single check mode
      console.log(`Checking model downloads on pod: ${targetPod}\n`);
      const output = this.ssh('python3 vllm_manager.py downloads', false, false, targetPod);

      if (output.includes('No HuggingFace cache found') || output.includes('No models in cache')) {
        console.log(output);
        return;
      }

      // Parse and display
      const downloadInfo = JSON.parse(output);
      this._displayDownloadInfo(downloadInfo);
    } else {
      // Live streaming mode
      const pod = this.config.pods[targetPod];
      // Build SSH command with proper shell invocation
      const sshParts = pod.ssh.split(' ');
      const remoteCmd = 'source .pirc && python3 vllm_manager.py downloads --stream';

      return new Promise((resolve) => {
        const proc = spawn('ssh', [...sshParts, remoteCmd], { stdio: ['inherit', 'pipe', 'pipe'] });

        let buffer = '';

        // Handle Ctrl+C gracefully
        process.on('SIGINT', () => {
          console.log('\n\nStopping download monitor...');
          proc.kill('SIGTERM'); // Send SIGTERM to remote process
          setTimeout(() => {
            proc.kill('SIGKILL'); // Force kill if not terminated
            process.exit(0);
          }, 1000);
        });

        // Print header once
        console.log(`Monitoring model downloads on pod: ${targetPod} (Press Ctrl+C to stop)`);
        console.log(); // Empty line after header

        // Hide cursor
        process.stdout.write('\x1B[?25l');

        // Ensure cursor is shown again on exit
        const cleanup = () => {
          process.stdout.write('\x1B[?25h');
        };
        process.on('exit', cleanup);
        process.on('SIGINT', cleanup);

        let previousLineCount = 0;

        proc.stdout.on('data', (data) => {
          buffer += data.toString();

          // Process complete lines
          const lines = buffer.split('\n');
          buffer = lines[lines.length - 1]; // Keep incomplete line in buffer

          for (let i = 0; i < lines.length - 1; i++) {
            const line = lines[i].trim();
            if (line) {
              try {
                const downloadInfo = JSON.parse(line);

                // If we printed lines before, move cursor back up
                if (previousLineCount > 0) {
                  process.stdout.write(`\x1B[${previousLineCount}A`); // Move up N lines
                  process.stdout.write('\x1B[0J'); // Clear from cursor to end of screen
                }

                // Build all output as a single string
                let output = '';
                const addLine = (text = '') => {
                  output += text + '\n';
                };

                if (downloadInfo.status === 'NO_CACHE' || downloadInfo.status === 'NO_MODELS') {
                  addLine(downloadInfo.message);
                } else {
                  // Build the display output
                  for (const model of downloadInfo.models) {
                    addLine(`Model: ${model.model}`);
                    addLine(` Size: ${model.size_gb}GB`);

                    if (model.total_files > 0) {
                      const percentage = Math.round((model.files / model.total_files) * 100);
                      addLine(` Files: ${model.files}/${model.total_files} (${percentage}%)`);

                      // Show progress bar
                      const barLength = 30;
                      const filled = Math.round((percentage / 100) * barLength);
                      const empty = barLength - filled;
                      const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
                      addLine(` Progress: [${progressBar}] ${percentage}%`);
                    } else {
                      addLine(` Files: ${model.files}`);
                    }

                    addLine(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
                    addLine(); // Empty line between models
                  }

                  if (downloadInfo.vllm_processes > 0) {
                    addLine(`Active vLLM processes: ${downloadInfo.vllm_processes}`);
                  }

                  addLine();
                  addLine(`Last updated: ${new Date().toLocaleTimeString()}`);
                }

                // Write all output at once and count lines
                process.stdout.write(output);
                previousLineCount = (output.match(/\n/g) || []).length;

              } catch (e) {
                // Not JSON, just display as is
                console.log(line);
              }
            }
          }
        });

        proc.stderr.on('data', (data) => {
          process.stderr.write(data);
        });

        proc.on('close', () => {
          cleanup(); // Restore cursor
          resolve();
        });
      });
    }
  }

  _displayDownloadInfo(downloadInfo) {
    for (const model of downloadInfo.models) {
      console.log(`\nModel: ${model.model}`);
      console.log(` Size: ${model.size_gb}GB`);

      if (model.total_files > 0) {
        const percentage = Math.round((model.files / model.total_files) * 100);
        console.log(` Files: ${model.files}/${model.total_files} (${percentage}%)`);

        // Show progress bar
        const barLength = 30;
        const filled = Math.round((percentage / 100) * barLength);
        const empty = barLength - filled;
        const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
        console.log(` Progress: [${progressBar}] ${percentage}%`);
      } else {
        console.log(` Files: ${model.files}`);
      }

      console.log(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
    }

    if (downloadInfo.vllm_processes > 0) {
      console.log(`\nActive vLLM processes: ${downloadInfo.vllm_processes}`);
    }

    // Show timestamp
    console.log(`\nLast updated: ${new Date().toLocaleTimeString()}`);
  }

  async prompt(name, message, podName = null) {
    // Get model info
    const models = this.getRunningModels(podName);
    const model = models[name];

    if (!model || !model.url) {
      console.error(`Model '${name}' is not running${podName ? ` on pod '${podName}'` : ''}`);
      console.error('Running models:', Object.keys(models).join(', ') || 'none');
      process.exit(1);
    }

    // Make API call directly to the model's external URL
    const url = `${model.url}/chat/completions`;
    const payload = {
      model: model.model_id,
      messages: [{ role: 'user', content: message }],
      max_tokens: 500,
      temperature: 0.7
    };

    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${await response.text()}`);
      }

      const data = await response.json();
      console.log(data.choices[0].message.content);
    } catch (error) {
      console.error('Error:', error.message);
      process.exit(1);
    }
  }

  showHelp() {
    console.log('\npi CLI\n');

    console.log('Pod Management:');
    console.log(' pi setup <pod-name> <ssh_command> Configure and activate a pod');
    console.log(' pi pods List all pods (active pod marked)');
    console.log(' pi pod <pod-name> Switch active pod');
    console.log(' pi pod remove <pod-name> Remove pod from config\n');
    console.log('Model Management:');
    console.log(' pi list [--pod <pod-name>] List running models');
    console.log(' pi search <query> Search HuggingFace models');
    console.log(' pi start <model> [options] Start a model');
    console.log(' pi stop [name] [--pod <pod-name>] Stop a model (or all if no name)');
    console.log(' pi logs <name> [--pod <pod-name>] View model logs');
    console.log(' pi prompt <name> <msg> [--pod <pod-name>] Chat with a model');
    console.log(' pi downloads [--pod <pod-name>] [--live] Check model download progress (--live for continuous monitoring)\n');
    console.log('Start Options:');
    console.log(' --name <name> Model alias (default: auto-generated)');
    console.log(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k (default: model default)');
    console.log(' --memory <percent> GPU memory: 30%, 50%, 90% (default: 90%)');
    console.log(' --all-gpus Use all GPUs with tensor parallelism');
    console.log(' --pod <pod-name> Run on specific pod without switching active pod');
    console.log(' --debug Enable debug logging for vLLM');
    console.log(' --vllm-args Pass remaining args directly to vLLM\n');
    console.log('Utility:');
    console.log(' pi shell [--pod <pod-name>] SSH into pod');
    console.log(' pi ssh [--pod <pod-name>] <cmd> Run SSH command on pod');

    console.log('\nQuick Examples:');
    console.log(' pi start Qwen/Qwen2.5-7B-Instruct --name qwen');
    console.log(' pi prompt qwen "What is 2+2?"');
    console.log('\n # Qwen3-Coder on 8xH200 with custom vLLM args:');
    console.log(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
    console.log(' --data-parallel-size 8 --enable-expert-parallel \\');
    console.log(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');

    if (this.config.active && this.config.pods[this.config.active]) {
      console.log(`\nActive pod: ${this.config.active} (${this.config.pods[this.config.active].ssh})`);
    } else {
      console.log('\nNo active pod');
    }
  }

  getRunningModels(podName = null) {
    try {
      const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
      const models = {};

      // Parse the output to extract model info
      const lines = output.split('\n');
      let currentModel = null;

      for (const line of lines) {
        if (line.match(/^[a-zA-Z0-9_-]+:$/)) {
          currentModel = line.slice(0, -1);
          models[currentModel] = {};
        } else if (currentModel) {
          if (line.includes('Model:')) {
            models[currentModel].model_id = line.split('Model:')[1].trim();
          } else if (line.includes('Port:')) {
            models[currentModel].port = parseInt(line.split('Port:')[1].trim());
          } else if (line.includes('URL:')) {
            models[currentModel].url = line.split('URL:')[1].trim();
          }
        }
      }

      return models;
    } catch (e) {
      return {};
    }
  }

  async run() {
    const [,, command, ...args] = process.argv;

    // Handle --version flag
    if (command === '--version' || command === '-v') {
      const packageJsonPath = path.join(__dirname, 'package.json');
      try {
        const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8'));
        console.log(packageJson.version);
      } catch (error) {
        console.error('Error reading version:', error.message);
        process.exit(1);
      }
      return;
    }

    switch (command) {
      case 'setup': {
        if (args.length < 2) {
          console.error('Usage: pi setup <pod-name> <ssh_command>');
          console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
          process.exit(1);
        }
        const podName = args[0];
        const sshCmd = args.slice(1).join(' ');
        this.setup(podName, sshCmd);
        break;
      }
      case 'pods':
        this.listPods();
        break;

      case 'pod':
        if (!args[0]) {
          console.error('Usage: pi pod <pod-name>');
          console.error(' pi pod remove <pod-name>');
          process.exit(1);
        }
        if (args[0] === 'remove' && args[1]) {
          this.removePod(args[1]);
        } else {
          this.switchPod(args[0]);
        }
        break;

      case 'list':
      case 'ls': {
        let podName = null;

        // Parse --pod parameter
        const podIndex = args.indexOf('--pod');
        if (podIndex !== -1 && args[podIndex + 1]) {
          podName = args[podIndex + 1];
        }

        this.list(podName);
        break;
      }

      case 'search':
        if (!args[0]) {
          console.error('Usage: pi search <query>');
          console.error('Example: pi search qwen');
          process.exit(1);
        }
        await this.searchModels(args[0]);
        break;

      case 'downloads': {
        let podName = null;
        let live = false;

        // Parse --pod parameter
        const podIndex = args.indexOf('--pod');
        if (podIndex !== -1 && args[podIndex + 1]) {
          podName = args[podIndex + 1];
        }

        // Parse --live parameter
        if (args.includes('--live')) {
          live = true;
        }

        await this.checkDownloads(podName, live);
        break;
      }

      case 'start':
        await this.handleStart(args);
        break;

      case 'stop': {
        let modelName = args[0];
        let podName = null;

        // Parse --pod parameter
        const podIndex = args.indexOf('--pod');
        if (podIndex !== -1 && args[podIndex + 1]) {
          podName = args[podIndex + 1];
          // Remove --pod and its value from args
          args.splice(podIndex, 2);
          modelName = args[0]; // Update modelName after removing --pod
        }

        this.stop(modelName, podName);
        break;
      }

      case 'logs': {
        let modelName = args[0];
        let podName = null;

        // Parse --pod parameter
        const podIndex = args.indexOf('--pod');
        if (podIndex !== -1 && args[podIndex + 1]) {
          podName = args[podIndex + 1];
          // Remove --pod and its value from args
          args.splice(podIndex, 2);
          modelName = args[0]; // Update modelName after removing --pod
        }

        await this.logs(modelName, false, podName); // autoExit = false for manual logs command
        break;
      }

      case 'prompt': {
        if (args.length < 2) {
          console.error('Usage: pi prompt <model_name> "<message>" [--pod <pod-name>]');
          console.error('Example: pi prompt phi3 "Hey, how you going"');
          process.exit(1);
        }
        let modelName = args[0];
        let podName = null;

        // Parse --pod parameter
        const podIndex = args.indexOf('--pod');
        if (podIndex !== -1 && args[podIndex + 1]) {
          podName = args[podIndex + 1];
          // Remove --pod and its value from args
          args.splice(podIndex, 2);
        }

        const message = args.slice(1).join(' ');
        this.prompt(modelName, message, podName);
        break;
      }
      case 'shell': {
        let podName = null;

        // Parse --pod parameter
        const podIndex = args.indexOf('--pod');
        if (podIndex !== -1 && args[podIndex + 1]) {
          podName = args[podIndex + 1];
        }

        await this.shell(podName);
        break;
      }

      case 'ssh': {
        let podName = null;
        let sshArgs = [...args];

        // For ssh, --pod must be the first parameter if present
        if (args[0] === '--pod' && args[1]) {
          podName = args[1];
          sshArgs = args.slice(2); // Remove --pod and podName from args
        }

        // Pass through any SSH command
        if (sshArgs.length > 0) {
          const output = this.ssh(sshArgs.join(' '), false, false, podName);
          console.log(output);
        } else {
          await this.shell(podName);
        }
        break;
      }

      default:
        this.showHelp();
    }
  }
}

// Run CLI
const cli = new PiCli();
cli.run().catch(console.error);