@draht/pods 2026.3.2-2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +511 -0
  2. package/dist/cli.d.ts +3 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +346 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/commands/models.d.ts +39 -0
  7. package/dist/commands/models.d.ts.map +1 -0
  8. package/dist/commands/models.js +658 -0
  9. package/dist/commands/models.js.map +1 -0
  10. package/dist/commands/pods.d.ts +21 -0
  11. package/dist/commands/pods.d.ts.map +1 -0
  12. package/dist/commands/pods.js +175 -0
  13. package/dist/commands/pods.js.map +1 -0
  14. package/dist/commands/prompt.d.ts +7 -0
  15. package/dist/commands/prompt.d.ts.map +1 -0
  16. package/dist/commands/prompt.js +54 -0
  17. package/dist/commands/prompt.js.map +1 -0
  18. package/dist/config.d.ts +11 -0
  19. package/dist/config.d.ts.map +1 -0
  20. package/dist/config.js +74 -0
  21. package/dist/config.js.map +1 -0
  22. package/dist/index.d.ts +2 -0
  23. package/dist/index.d.ts.map +1 -0
  24. package/dist/index.js +3 -0
  25. package/dist/index.js.map +1 -0
  26. package/dist/model-configs.d.ts +22 -0
  27. package/dist/model-configs.d.ts.map +1 -0
  28. package/dist/model-configs.js +75 -0
  29. package/dist/model-configs.js.map +1 -0
  30. package/dist/models.json +295 -0
  31. package/dist/scripts/model_run.sh +83 -0
  32. package/dist/scripts/pod_setup.sh +336 -0
  33. package/dist/ssh.d.ts +24 -0
  34. package/dist/ssh.d.ts.map +1 -0
  35. package/dist/ssh.js +115 -0
  36. package/dist/ssh.js.map +1 -0
  37. package/dist/types.d.ts +23 -0
  38. package/dist/types.d.ts.map +1 -0
  39. package/dist/types.js +3 -0
  40. package/dist/types.js.map +1 -0
  41. package/package.json +40 -0
  42. package/scripts/model_run.sh +83 -0
  43. package/scripts/pod_setup.sh +336 -0
package/README.md ADDED
@@ -0,0 +1,511 @@
1
+ # pi
2
+
3
+ Deploy and manage LLMs on GPU pods with automatic vLLM configuration for agentic workloads.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install -g @mariozechner/pi
9
+ ```
10
+
11
+ ## What is pi?
12
+
13
+ `pi` simplifies running large language models on remote GPU pods. It automatically:
14
+ - Sets up vLLM on fresh Ubuntu pods
15
+ - Configures tool calling for agentic models (Qwen, GPT-OSS, GLM, etc.)
16
+ - Manages multiple models on the same pod with "smart" GPU allocation
17
+ - Provides OpenAI-compatible API endpoints for each model
18
+ - Includes an interactive agent with file system tools for testing
19
+
20
+ ## Quick Start
21
+
22
+ ```bash
23
+ # Set required environment variables
24
+ export HF_TOKEN=your_huggingface_token # Get from https://huggingface.co/settings/tokens
25
+ export PI_API_KEY=your_api_key # Any string you want for API authentication
26
+
27
+ # Setup a DataCrunch pod with NFS storage (models path auto-extracted)
28
+ pi pods setup dc1 "ssh root@1.2.3.4" \
29
+ --mount "sudo mount -t nfs -o nconnect=16 nfs.fin-02.datacrunch.io:/your-pseudo /mnt/hf-models"
30
+
31
+ # Start a model (automatic configuration for known models)
32
+ pi start Qwen/Qwen2.5-Coder-32B-Instruct --name qwen
33
+
34
+ # Send a single message to the model
35
+ pi agent qwen "What is the Fibonacci sequence?"
36
+
37
+ # Interactive chat mode with file system tools
38
+ pi agent qwen -i
39
+
40
+ # Use with any OpenAI-compatible client
41
+ export OPENAI_BASE_URL='http://1.2.3.4:8001/v1'
42
+ export OPENAI_API_KEY=$PI_API_KEY
43
+ ```
44
+
45
+ ## Prerequisites
46
+
47
+ - Node.js 18+
48
+ - HuggingFace token (for model downloads)
49
+ - GPU pod with:
50
+ - Ubuntu 22.04 or 24.04
51
+ - SSH root access
52
+ - NVIDIA drivers installed
53
+ - Persistent storage for models
54
+
55
+ ## Supported Providers
56
+
57
+ ### Primary Support
58
+
59
+ **DataCrunch** - Best for shared model storage
60
+ - NFS volumes sharable across multiple pods in same region
61
+ - Models download once, use everywhere
62
+ - Ideal for teams or multiple experiments
63
+
64
+ **RunPod** - Good persistent storage
65
+ - Network volumes persist independently
66
+ - Cannot share between running pods simultaneously
67
+ - Good for single-pod workflows
68
+
69
+ ### Also Works With
70
+ - Vast.ai (volumes locked to specific machine)
71
+ - Prime Intellect (no persistent storage)
72
+ - AWS EC2 (with EFS setup)
73
+ - Any Ubuntu machine with NVIDIA GPUs, CUDA driver, and SSH
74
+
75
+ ## Commands
76
+
77
+ ### Pod Management
78
+
79
+ ```bash
80
+ pi pods setup <name> "<ssh>" [options] # Setup new pod
81
+ --mount "<mount_command>" # Run mount command during setup
82
+ --models-path <path> # Override extracted path (optional)
83
+ --vllm release|nightly|gpt-oss # vLLM version (default: release)
84
+
85
+ pi pods # List all configured pods
86
+ pi pods active <name> # Switch active pod
87
+ pi pods remove <name> # Remove pod from local config
88
+ pi shell [<name>] # SSH into pod
89
+ pi ssh [<name>] "<command>" # Run command on pod
90
+ ```
91
+
92
+ **Note**: When using `--mount`, the models path is automatically extracted from the mount command's target directory. You only need `--models-path` if not using `--mount` or to override the extracted path.
93
+
94
+ #### vLLM Version Options
95
+
96
+ - `release` (default): Stable vLLM release, recommended for most users
97
+ - `nightly`: Latest vLLM features, needed for newest models like GLM-4.5
98
+ - `gpt-oss`: Special build for OpenAI's GPT-OSS models only
99
+
100
+ ### Model Management
101
+
102
+ ```bash
103
+ pi start <model> --name <name> [options] # Start a model
104
+ --memory <percent> # GPU memory: 30%, 50%, 90% (default: 90%)
105
+ --context <size> # Context window: 4k, 8k, 16k, 32k, 64k, 128k
106
+ --gpus <count> # Number of GPUs to use (predefined models only)
107
+ --pod <name> # Target specific pod (overrides active)
108
+ --vllm <args...> # Pass custom args directly to vLLM
109
+
110
+ pi stop [<name>] # Stop model (or all if no name given)
111
+ pi list # List running models with status
112
+ pi logs <name> # Stream model logs (tail -f)
113
+ ```
114
+
115
+ ### Agent & Chat Interface
116
+
117
+ ```bash
118
+ pi agent <name> "<message>" # Single message to model
119
+ pi agent <name> "<msg1>" "<msg2>" # Multiple messages in sequence
120
+ pi agent <name> -i # Interactive chat mode
121
+ pi agent <name> -i -c # Continue previous session
122
+
123
+ # Standalone OpenAI-compatible agent (works with any API)
124
+ pi-agent --base-url http://localhost:8000/v1 --model llama-3.1 "Hello"
125
+ pi-agent --api-key sk-... "What is 2+2?" # Uses OpenAI by default
126
+ pi-agent --json "What is 2+2?" # Output event stream as JSONL
127
+ pi-agent -i # Interactive mode
128
+ ```
129
+
130
+ The agent includes tools for file operations (read, list, bash, glob, rg) to test agentic capabilities, particularly useful for code navigation and analysis tasks.
131
+
132
+ ## Predefined Model Configurations
133
+
134
+ `pi` includes predefined configurations for popular agentic models, so you do not have to specify `--vllm` arguments manually. `pi` will also check if the model you selected can actually run on your pod with respect to the number of GPUs and available VRAM. Run `pi start` without additional arguments to see a list of predefined models that can run on the active pod.
135
+
136
+ ### Qwen Models
137
+ ```bash
138
+ # Qwen2.5-Coder-32B - Excellent coding model, fits on single H100/H200
139
+ pi start Qwen/Qwen2.5-Coder-32B-Instruct --name qwen
140
+
141
+ # Qwen3-Coder-30B - Advanced reasoning with tool use
142
+ pi start Qwen/Qwen3-Coder-30B-A3B-Instruct --name qwen3
143
+
144
+ # Qwen3-Coder-480B - State-of-the-art on 8xH200 (data-parallel mode)
145
+ pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-480b
146
+ ```
147
+
148
+ ### GPT-OSS Models
149
+ ```bash
150
+ # Requires special vLLM build during setup
151
+ pi pods setup gpt-pod "ssh root@1.2.3.4" --models-path /workspace --vllm gpt-oss
152
+
153
+ # GPT-OSS-20B - Fits on 16GB+ VRAM
154
+ pi start openai/gpt-oss-20b --name gpt20
155
+
156
+ # GPT-OSS-120B - Needs 60GB+ VRAM
157
+ pi start openai/gpt-oss-120b --name gpt120
158
+ ```
159
+
160
+ ### GLM Models
161
+ ```bash
162
+ # GLM-4.5 - Requires 8-16 GPUs, includes thinking mode
163
+ pi start zai-org/GLM-4.5 --name glm
164
+
165
+ # GLM-4.5-Air - Smaller version, 1-2 GPUs
166
+ pi start zai-org/GLM-4.5-Air --name glm-air
167
+ ```
168
+
169
+ ### Custom Models with --vllm
170
+
171
+ For models not in the predefined list, use `--vllm` to pass arguments directly to vLLM:
172
+
173
+ ```bash
174
+ # DeepSeek with custom settings
175
+ pi start deepseek-ai/DeepSeek-V3 --name deepseek --vllm \
176
+ --tensor-parallel-size 4 --trust-remote-code
177
+
178
+ # Mistral with pipeline parallelism
179
+ pi start mistralai/Mixtral-8x22B-Instruct-v0.1 --name mixtral --vllm \
180
+ --tensor-parallel-size 8 --pipeline-parallel-size 2
181
+
182
+ # Any model with specific tool parser
183
+ pi start some/model --name mymodel --vllm \
184
+ --tool-call-parser hermes --enable-auto-tool-choice
185
+ ```
186
+
187
+ ## DataCrunch Setup
188
+
189
+ DataCrunch offers the best experience with shared NFS storage across pods:
190
+
191
+ ### 1. Create Shared Filesystem (SFS)
192
+ - Go to DataCrunch dashboard → Storage → Create SFS
193
+ - Choose size and datacenter
194
+ - Note the mount command (e.g., `sudo mount -t nfs -o nconnect=16 nfs.fin-02.datacrunch.io:/hf-models-fin02-8ac1bab7 /mnt/hf-models-fin02`)
195
+
196
+ ### 2. Create GPU Instance
197
+ - Create instance in same datacenter as SFS
198
+ - Share the SFS with the instance
199
+ - Get SSH command from dashboard
200
+
201
+ ### 3. Setup with pi
202
+ ```bash
203
+ # Get mount command from DataCrunch dashboard
204
+ pi pods setup dc1 "ssh root@instance.datacrunch.io" \
205
+ --mount "sudo mount -t nfs -o nconnect=16 nfs.fin-02.datacrunch.io:/your-pseudo /mnt/hf-models"
206
+
207
+ # Models automatically stored in /mnt/hf-models (extracted from mount command)
208
+ ```
209
+
210
+ ### 4. Benefits
211
+ - Models persist across instance restarts
212
+ - Share models between multiple instances in same datacenter
213
+ - Download once, use everywhere
214
+ - Pay only for storage, not compute time during downloads
215
+
216
+ ## RunPod Setup
217
+
218
+ RunPod offers good persistent storage with network volumes:
219
+
220
+ ### 1. Create Network Volume (optional)
221
+ - Go to RunPod dashboard → Storage → Create Network Volume
222
+ - Choose size and region
223
+
224
+ ### 2. Create GPU Pod
225
+ - Select "Network Volume" during pod creation (if using)
226
+ - Attach your volume to `/runpod-volume`
227
+ - Get SSH command from pod details
228
+
229
+ ### 3. Setup with pi
230
+ ```bash
231
+ # With network volume
232
+ pi pods setup runpod "ssh root@pod.runpod.io" --models-path /runpod-volume
233
+
234
+ # Or use workspace (persists with pod but not shareable)
235
+ pi pods setup runpod "ssh root@pod.runpod.io" --models-path /workspace
236
+ ```
237
+
238
+
239
+ ## Multi-GPU Support
240
+
241
+ ### Automatic GPU Assignment
242
+ When running multiple models, pi automatically assigns them to different GPUs:
243
+ ```bash
244
+ pi start model1 --name m1 # Auto-assigns to GPU 0
245
+ pi start model2 --name m2 # Auto-assigns to GPU 1
246
+ pi start model3 --name m3 # Auto-assigns to GPU 2
247
+ ```
248
+
249
+ ### Specify GPU Count for Predefined Models
250
+ For predefined models with multiple configurations, use `--gpus` to control GPU usage:
251
+ ```bash
252
+ # Run Qwen on 1 GPU instead of all available
253
+ pi start Qwen/Qwen2.5-Coder-32B-Instruct --name qwen --gpus 1
254
+
255
+ # Run GLM-4.5 on 8 GPUs (if it has an 8-GPU config)
256
+ pi start zai-org/GLM-4.5 --name glm --gpus 8
257
+ ```
258
+
259
+ If the model doesn't have a configuration for the requested GPU count, you'll see available options.
260
+
261
+ ### Tensor Parallelism for Large Models
262
+ For models that don't fit on a single GPU:
263
+ ```bash
264
+ # Split the model across 4 GPUs with tensor parallelism
265
+ pi start meta-llama/Llama-3.1-70B-Instruct --name llama70b --vllm \
266
+ --tensor-parallel-size 4
267
+
268
+ # Data-parallel across 8 GPUs with expert parallelism
269
+ pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen480 --vllm \
270
+ --data-parallel-size 8 --enable-expert-parallel
271
+ ```
272
+
273
+ ## API Integration
274
+
275
+ All models expose OpenAI-compatible endpoints:
276
+
277
+ ```python
278
+ from openai import OpenAI
279
+
280
+ client = OpenAI(
281
+ base_url="http://your-pod-ip:8001/v1",
282
+ api_key="your-pi-api-key"
283
+ )
284
+
285
+ # Chat completion with tool calling
286
+ response = client.chat.completions.create(
287
+ model="Qwen/Qwen2.5-Coder-32B-Instruct",
288
+ messages=[
289
+ {"role": "user", "content": "Write a Python function to calculate fibonacci"}
290
+ ],
291
+ tools=[{
292
+ "type": "function",
293
+ "function": {
294
+ "name": "execute_code",
295
+ "description": "Execute Python code",
296
+ "parameters": {
297
+ "type": "object",
298
+ "properties": {
299
+ "code": {"type": "string"}
300
+ },
301
+ "required": ["code"]
302
+ }
303
+ }
304
+ }],
305
+ tool_choice="auto"
306
+ )
307
+ ```
308
+
309
+ ## Standalone Agent CLI
310
+
311
+ `pi` includes a standalone OpenAI-compatible agent that can work with any API:
312
+
313
+ ```bash
314
+ # Install globally to get pi-agent command
315
+ npm install -g @mariozechner/pi
316
+
317
+ # Use with OpenAI
318
+ pi-agent --api-key sk-... "What is machine learning?"
319
+
320
+ # Use with local vLLM
321
+ pi-agent --base-url http://localhost:8000/v1 \
322
+ --model meta-llama/Llama-3.1-8B-Instruct \
323
+ --api-key dummy \
324
+ "Explain quantum computing"
325
+
326
+ # Interactive mode
327
+ pi-agent -i
328
+
329
+ # Continue previous session
330
+ pi-agent --continue "Follow up question"
331
+
332
+ # Custom system prompt
333
+ pi-agent --system-prompt "You are a Python expert" "Write a web scraper"
334
+
335
+ # Use responses API (for GPT-OSS models)
336
+ pi-agent --api responses --model openai/gpt-oss-20b "Hello"
337
+ ```
338
+
339
+ The agent supports:
340
+ - Session persistence across conversations
341
+ - Interactive TUI mode with syntax highlighting
342
+ - File system tools (read, list, bash, glob, rg) for code navigation
343
+ - Both Chat Completions and Responses API formats
344
+ - Custom system prompts
345
+
346
+ ## Tool Calling Support
347
+
348
+ `pi` automatically configures appropriate tool calling parsers for known models:
349
+
350
+ - **Qwen models**: `hermes` parser (Qwen3-Coder uses `qwen3_coder`)
351
+ - **GLM models**: `glm4_moe` parser with reasoning support
352
+ - **GPT-OSS models**: Uses `/v1/responses` endpoint, as tool calling (function calling in OpenAI parlance) is currently a [WIP with the `v1/chat/completions` endpoint](https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html#tool-use).
353
+ - **Custom models**: Specify with `--vllm --tool-call-parser <parser> --enable-auto-tool-choice`
354
+
355
+ To disable tool calling:
356
+ ```bash
357
+ pi start model --name mymodel --vllm --disable-tool-call-parser
358
+ ```
359
+
360
+ ## Memory and Context Management
361
+
362
+ ### GPU Memory Allocation
363
+ Controls how much GPU memory vLLM pre-allocates:
364
+ - `--memory 30%`: High concurrency, limited context
365
+ - `--memory 50%`: Balanced
366
+ - `--memory 90%`: Maximum context, low concurrency (default)
367
+
368
+ ### Context Window
369
+ Sets maximum input + output tokens:
370
+ - `--context 4k`: 4,096 tokens total
371
+ - `--context 32k`: 32,768 tokens total
372
+ - `--context 128k`: 131,072 tokens total
373
+
374
+ Example for coding workload:
375
+ ```bash
376
+ # Large context for code analysis, moderate concurrency
377
+ pi start Qwen/Qwen2.5-Coder-32B-Instruct --name coder \
378
+ --context 64k --memory 70%
379
+ ```
380
+
381
+ **Note**: When using `--vllm`, the `--memory`, `--context`, and `--gpus` parameters are ignored. You'll see a warning if you try to use them together.
382
+
383
+ ## Session Persistence
384
+
385
+ The interactive agent mode (`-i`) saves sessions for each project directory:
386
+
387
+ ```bash
388
+ # Start new session
389
+ pi agent qwen -i
390
+
391
+ # Continue previous session (maintains chat history)
392
+ pi agent qwen -i -c
393
+ ```
394
+
395
+ Sessions are stored in `~/.pi/sessions/` organized by project path and include:
396
+ - Complete conversation history
397
+ - Tool call results
398
+ - Token usage statistics
399
+
400
+ ## Architecture & Event System
401
+
402
+ The agent uses a unified event-based architecture where all interactions flow through `AgentEvent` types. This enables:
403
+ - Consistent UI rendering across console and TUI modes
404
+ - Session recording and replay
405
+ - Clean separation between API calls and UI updates
406
+ - JSON output mode for programmatic integration
407
+
408
+ Events are automatically converted to the appropriate API format (Chat Completions or Responses) based on the model type.
409
+
410
+ ### JSON Output Mode
411
+
412
+ Use `--json` flag to output the event stream as JSONL (JSON Lines) for programmatic consumption:
413
+ ```bash
414
+ pi-agent --api-key sk-... --json "What is 2+2?"
415
+ ```
416
+
417
+ Each line is a complete JSON object representing an event:
418
+ ```jsonl
419
+ {"type":"user_message","text":"What is 2+2?"}
420
+ {"type":"assistant_start"}
421
+ {"type":"assistant_message","text":"2 + 2 = 4"}
422
+ {"type":"token_usage","inputTokens":10,"outputTokens":5,"totalTokens":15,"cacheReadTokens":0,"cacheWriteTokens":0}
423
+ ```
424
+
425
+ ## Troubleshooting
426
+
427
+ ### OOM (Out of Memory) Errors
428
+ - Reduce `--memory` percentage
429
+ - Use smaller model or quantized version (FP8)
430
+ - Reduce `--context` size
431
+
432
+ ### Model Won't Start
433
+ ```bash
434
+ # Check GPU usage
435
+ pi ssh "nvidia-smi"
436
+
437
+ # Check if port is in use
438
+ pi list
439
+
440
+ # Force stop all models
441
+ pi stop
442
+ ```
443
+
444
+ ### Tool Calling Issues
445
+ - Not all models support tool calling reliably
446
+ - Try different parser: `--vllm --tool-call-parser mistral`
447
+ - Or disable: `--vllm --disable-tool-call-parser`
448
+
449
+ ### Access Denied for Models
450
+ Some models (Llama, Mistral) require HuggingFace access approval. Visit the model page and click "Request access".
451
+
452
+ ### vLLM Build Issues
453
+ If using `--vllm nightly` fails, try:
454
+ - Use `--vllm release` for stable version
455
+ - Check CUDA compatibility with `pi ssh "nvidia-smi"`
456
+
457
+ ### Agent Not Finding Messages
458
+ If the agent shows configuration instead of your message, ensure quotes around messages with special characters:
459
+ ```bash
460
+ # Good
461
+ pi agent qwen "What is this file about?"
462
+
463
+ # Bad (shell might interpret special chars)
464
+ pi agent qwen What is this file about?
465
+ ```
466
+
467
+ ## Advanced Usage
468
+
469
+ ### Working with Multiple Pods
470
+ ```bash
471
+ # Override active pod for any command
472
+ pi start model --name test --pod dev-pod
473
+ pi list --pod prod-pod
474
+ pi stop test --pod dev-pod
475
+ ```
476
+
477
+ ### Custom vLLM Arguments
478
+ ```bash
479
+ # Pass any vLLM argument after --vllm
480
+ pi start model --name custom --vllm \
481
+ --quantization awq \
482
+ --enable-prefix-caching \
483
+ --max-num-seqs 256 \
484
+ --gpu-memory-utilization 0.95
485
+ ```
486
+
487
+ ### Monitoring
488
+ ```bash
489
+ # Watch GPU utilization
490
+ pi ssh "watch -n 1 nvidia-smi"
491
+
492
+ # Check model downloads
493
+ pi ssh "du -sh ~/.cache/huggingface/hub/*"
494
+
495
+ # View all logs
496
+ pi ssh "ls -la ~/.vllm_logs/"
497
+
498
+ # Check agent session history
499
+ ls -la ~/.pi/sessions/
500
+ ```
501
+
502
+ ## Environment Variables
503
+
504
+ - `HF_TOKEN` - HuggingFace token for model downloads
505
+ - `PI_API_KEY` - API key for vLLM endpoints
506
+ - `PI_CONFIG_DIR` - Config directory (default: `~/.pi`)
507
+ - `OPENAI_API_KEY` - Used by `pi-agent` when no `--api-key` provided
508
+
509
+ ## License
510
+
511
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":"","sourcesContent":["#!/usr/bin/env node\nimport chalk from \"chalk\";\nimport { spawn } from \"child_process\";\nimport { readFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport { listModels, showKnownModels, startModel, stopAllModels, stopModel, viewLogs } from \"./commands/models.js\";\nimport { listPods, removePodCommand, setupPod, switchActivePod } from \"./commands/pods.js\";\nimport { promptModel } from \"./commands/prompt.js\";\nimport { getActivePod, loadConfig } from \"./config.js\";\nimport { sshExecStream } from \"./ssh.js\";\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = dirname(__filename);\n\nconst packageJson = JSON.parse(readFileSync(join(__dirname, \"../package.json\"), \"utf-8\"));\n\nfunction printHelp() {\n\tconsole.log(`pi v${packageJson.version} - Manage vLLM deployments on GPU pods\n\nPod Management:\n pi pods setup <name> \"<ssh>\" --mount \"<mount>\" Setup pod with mount command\n Options:\n --vllm release Install latest vLLM release >=0.10.0 (default)\n --vllm nightly Install vLLM nightly build (latest features)\n --vllm gpt-oss Install vLLM 0.10.1+gptoss with PyTorch nightly (GPT-OSS only)\n pi pods List all pods (* = active)\n pi pods active <name> Switch active pod\n pi pods remove <name> Remove pod from local config\n pi shell [<name>] Open shell on pod (active or specified)\n pi ssh [<name>] \"<command>\" Run SSH command on pod\n\nModel Management:\n pi start <model> --name <name> [options] Start a model\n --memory <percent> GPU memory allocation (30%, 50%, 90%)\n --context <size> Context window (4k, 8k, 16k, 32k, 64k, 128k)\n --gpus <count> Number of GPUs to use (predefined models only)\n --vllm <args...> Pass remaining args to vLLM (ignores other options)\n pi stop [<name>] Stop model (or all if no name)\n pi list List running models\n pi logs 
<name> Stream model logs\n pi agent <name> [\"<message>\"...] [options] Chat with model using agent & tools\n pi agent <name> [options] Interactive chat mode\n --continue, -c Continue previous session\n --json Output as JSONL\n (All pi-agent options are supported)\n\n All model commands support --pod <name> to override the active pod.\n\nEnvironment:\n HF_TOKEN HuggingFace token for model downloads\n PI_API_KEY API key for vLLM endpoints\n PI_CONFIG_DIR Config directory (default: ~/.pi)`);\n}\n\n// Parse command line arguments\nconst args = process.argv.slice(2);\n\nif (args.length === 0 || args[0] === \"--help\" || args[0] === \"-h\") {\n\tprintHelp();\n\tprocess.exit(0);\n}\n\nif (args[0] === \"--version\" || args[0] === \"-v\") {\n\tconsole.log(packageJson.version);\n\tprocess.exit(0);\n}\n\nconst command = args[0];\nconst subcommand = args[1];\n\n// Main command handler\ntry {\n\t// Handle \"pi pods\" commands\n\tif (command === \"pods\") {\n\t\tif (!subcommand) {\n\t\t\t// pi pods - list all pods\n\t\t\tlistPods();\n\t\t} else if (subcommand === \"setup\") {\n\t\t\t// pi pods setup <name> \"<ssh>\" [--mount \"<mount>\"] [--models-path <path>] [--vllm release|nightly|gpt-oss]\n\t\t\tconst name = args[2];\n\t\t\tconst sshCmd = args[3];\n\n\t\t\tif (!name || !sshCmd) {\n\t\t\t\tconsole.error(\n\t\t\t\t\t'Usage: pi pods setup <name> \"<ssh>\" [--mount \"<mount>\"] [--models-path <path>] [--vllm release|nightly|gpt-oss]',\n\t\t\t\t);\n\t\t\t\tprocess.exit(1);\n\t\t\t}\n\n\t\t\t// Parse options\n\t\t\tconst options: { mount?: string; modelsPath?: string; vllm?: \"release\" | \"nightly\" | \"gpt-oss\" } = {};\n\t\t\tfor (let i = 4; i < args.length; i++) {\n\t\t\t\tif (args[i] === \"--mount\" && i + 1 < args.length) {\n\t\t\t\t\toptions.mount = args[i + 1];\n\t\t\t\t\ti++;\n\t\t\t\t} else if (args[i] === \"--models-path\" && i + 1 < args.length) {\n\t\t\t\t\toptions.modelsPath = args[i + 1];\n\t\t\t\t\ti++;\n\t\t\t\t} else if (args[i] === \"--vllm\" && i + 1 < 
args.length) {\n\t\t\t\t\tconst vllmType = args[i + 1];\n\t\t\t\t\tif (vllmType === \"release\" || vllmType === \"nightly\" || vllmType === \"gpt-oss\") {\n\t\t\t\t\t\toptions.vllm = vllmType;\n\t\t\t\t\t} else {\n\t\t\t\t\t\tconsole.error(chalk.red(`Invalid vLLM type: ${vllmType}`));\n\t\t\t\t\t\tconsole.error(\"Valid options: release, nightly, gpt-oss\");\n\t\t\t\t\t\tprocess.exit(1);\n\t\t\t\t\t}\n\t\t\t\t\ti++;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// If --mount provided but no --models-path, try to extract path from mount command\n\t\t\tif (options.mount && !options.modelsPath) {\n\t\t\t\t// Extract last part of mount command as models path\n\t\t\t\tconst parts = options.mount.trim().split(\" \");\n\t\t\t\tconst lastPart = parts[parts.length - 1];\n\t\t\t\tif (lastPart?.startsWith(\"/\")) {\n\t\t\t\t\toptions.modelsPath = lastPart;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tawait setupPod(name, sshCmd, options);\n\t\t} else if (subcommand === \"active\") {\n\t\t\t// pi pods active <name>\n\t\t\tconst name = args[2];\n\t\t\tif (!name) {\n\t\t\t\tconsole.error(\"Usage: pi pods active <name>\");\n\t\t\t\tprocess.exit(1);\n\t\t\t}\n\t\t\tswitchActivePod(name);\n\t\t} else if (subcommand === \"remove\") {\n\t\t\t// pi pods remove <name>\n\t\t\tconst name = args[2];\n\t\t\tif (!name) {\n\t\t\t\tconsole.error(\"Usage: pi pods remove <name>\");\n\t\t\t\tprocess.exit(1);\n\t\t\t}\n\t\t\tremovePodCommand(name);\n\t\t} else {\n\t\t\tconsole.error(`Unknown pods subcommand: ${subcommand}`);\n\t\t\tprocess.exit(1);\n\t\t}\n\t} else {\n\t\t// Parse --pod override for model commands\n\t\tlet podOverride: string | undefined;\n\t\tconst podIndex = args.indexOf(\"--pod\");\n\t\tif (podIndex !== -1 && podIndex + 1 < args.length) {\n\t\t\tpodOverride = args[podIndex + 1];\n\t\t\t// Remove --pod and its value from args\n\t\t\targs.splice(podIndex, 2);\n\t\t}\n\n\t\t// Handle SSH/shell commands and model commands\n\t\tswitch (command) {\n\t\t\tcase \"shell\": {\n\t\t\t\t// pi shell [<name>] - open 
interactive shell\n\t\t\t\tconst podName = args[1];\n\t\t\t\tlet podInfo: { name: string; pod: import(\"./types.js\").Pod } | null = null;\n\n\t\t\t\tif (podName) {\n\t\t\t\t\tconst config = loadConfig();\n\t\t\t\t\tconst pod = config.pods[podName];\n\t\t\t\t\tif (pod) {\n\t\t\t\t\t\tpodInfo = { name: podName, pod };\n\t\t\t\t\t}\n\t\t\t\t} else {\n\t\t\t\t\tpodInfo = getActivePod();\n\t\t\t\t}\n\n\t\t\t\tif (!podInfo) {\n\t\t\t\t\tif (podName) {\n\t\t\t\t\t\tconsole.error(chalk.red(`Pod '${podName}' not found`));\n\t\t\t\t\t} else {\n\t\t\t\t\t\tconsole.error(chalk.red(\"No active pod. Use 'pi pods active <name>' to set one.\"));\n\t\t\t\t\t}\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tconsole.log(chalk.green(`Connecting to pod '${podInfo.name}'...`));\n\n\t\t\t\t// Execute SSH in interactive mode\n\t\t\t\tconst sshArgs = podInfo.pod.ssh.split(\" \").slice(1); // Remove 'ssh' from command\n\t\t\t\tconst sshProcess = spawn(\"ssh\", sshArgs, {\n\t\t\t\t\tstdio: \"inherit\",\n\t\t\t\t\tenv: process.env,\n\t\t\t\t});\n\n\t\t\t\tsshProcess.on(\"exit\", (code) => {\n\t\t\t\t\tprocess.exit(code || 0);\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"ssh\": {\n\t\t\t\t// pi ssh [<name>] \"<command>\" - run command via SSH\n\t\t\t\tlet podName: string | undefined;\n\t\t\t\tlet sshCommand: string;\n\n\t\t\t\tif (args.length === 2) {\n\t\t\t\t\t// pi ssh \"<command>\" - use active pod\n\t\t\t\t\tsshCommand = args[1];\n\t\t\t\t} else if (args.length === 3) {\n\t\t\t\t\t// pi ssh <name> \"<command>\"\n\t\t\t\t\tpodName = args[1];\n\t\t\t\t\tsshCommand = args[2];\n\t\t\t\t} else {\n\t\t\t\t\tconsole.error('Usage: pi ssh [<name>] \"<command>\"');\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tlet podInfo: { name: string; pod: import(\"./types.js\").Pod } | null = null;\n\n\t\t\t\tif (podName) {\n\t\t\t\t\tconst config = loadConfig();\n\t\t\t\t\tconst pod = config.pods[podName];\n\t\t\t\t\tif (pod) {\n\t\t\t\t\t\tpodInfo = { name: podName, pod 
};\n\t\t\t\t\t}\n\t\t\t\t} else {\n\t\t\t\t\tpodInfo = getActivePod();\n\t\t\t\t}\n\n\t\t\t\tif (!podInfo) {\n\t\t\t\t\tif (podName) {\n\t\t\t\t\t\tconsole.error(chalk.red(`Pod '${podName}' not found`));\n\t\t\t\t\t} else {\n\t\t\t\t\t\tconsole.error(chalk.red(\"No active pod. Use 'pi pods active <name>' to set one.\"));\n\t\t\t\t\t}\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tconsole.log(chalk.gray(`Running on pod '${podInfo.name}': ${sshCommand}`));\n\n\t\t\t\t// Execute command and stream output\n\t\t\t\tconst exitCode = await sshExecStream(podInfo.pod.ssh, sshCommand);\n\t\t\t\tprocess.exit(exitCode);\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"start\": {\n\t\t\t\t// pi start <model> --name <name> [options]\n\t\t\t\tconst modelId = args[1];\n\t\t\t\tif (!modelId) {\n\t\t\t\t\t// Show available models\n\t\t\t\t\tawait showKnownModels();\n\t\t\t\t\tprocess.exit(0);\n\t\t\t\t}\n\n\t\t\t\t// Parse options\n\t\t\t\tlet name: string | undefined;\n\t\t\t\tlet memory: string | undefined;\n\t\t\t\tlet context: string | undefined;\n\t\t\t\tlet gpus: number | undefined;\n\t\t\t\tconst vllmArgs: string[] = [];\n\t\t\t\tlet inVllmArgs = false;\n\n\t\t\t\tfor (let i = 2; i < args.length; i++) {\n\t\t\t\t\tif (inVllmArgs) {\n\t\t\t\t\t\tvllmArgs.push(args[i]);\n\t\t\t\t\t} else if (args[i] === \"--name\" && i + 1 < args.length) {\n\t\t\t\t\t\tname = args[i + 1];\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--memory\" && i + 1 < args.length) {\n\t\t\t\t\t\tmemory = args[i + 1];\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--context\" && i + 1 < args.length) {\n\t\t\t\t\t\tcontext = args[i + 1];\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--gpus\" && i + 1 < args.length) {\n\t\t\t\t\t\tgpus = parseInt(args[i + 1], 10);\n\t\t\t\t\t\tif (Number.isNaN(gpus) || gpus < 1) {\n\t\t\t\t\t\t\tconsole.error(chalk.red(\"--gpus must be a positive number\"));\n\t\t\t\t\t\t\tprocess.exit(1);\n\t\t\t\t\t\t}\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === 
\"--vllm\") {\n\t\t\t\t\t\tinVllmArgs = true;\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\tif (!name) {\n\t\t\t\t\tconsole.error(\"--name is required\");\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\t// Warn if --vllm is used with other parameters\n\t\t\t\tif (vllmArgs.length > 0 && (memory || context || gpus)) {\n\t\t\t\t\tconsole.log(\n\t\t\t\t\t\tchalk.yellow(\"⚠ Warning: --memory, --context, and --gpus are ignored when --vllm is specified\"),\n\t\t\t\t\t);\n\t\t\t\t\tconsole.log(chalk.yellow(\" Using only custom vLLM arguments\"));\n\t\t\t\t\tconsole.log(\"\");\n\t\t\t\t}\n\n\t\t\t\tawait startModel(modelId, name, {\n\t\t\t\t\tpod: podOverride,\n\t\t\t\t\tmemory,\n\t\t\t\t\tcontext,\n\t\t\t\t\tgpus,\n\t\t\t\t\tvllmArgs: vllmArgs.length > 0 ? vllmArgs : undefined,\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"stop\": {\n\t\t\t\t// pi stop [name] - stop specific model or all models\n\t\t\t\tconst name = args[1];\n\t\t\t\tif (!name) {\n\t\t\t\t\t// Stop all models on the active pod\n\t\t\t\t\tawait stopAllModels({ pod: podOverride });\n\t\t\t\t} else {\n\t\t\t\t\tawait stopModel(name, { pod: podOverride });\n\t\t\t\t}\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"list\":\n\t\t\t\t// pi list\n\t\t\t\tawait listModels({ pod: podOverride });\n\t\t\t\tbreak;\n\t\t\tcase \"logs\": {\n\t\t\t\t// pi logs <name>\n\t\t\t\tconst name = args[1];\n\t\t\t\tif (!name) {\n\t\t\t\t\tconsole.error(\"Usage: pi logs <name>\");\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\t\t\t\tawait viewLogs(name, { pod: podOverride });\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"agent\": {\n\t\t\t\t// pi agent <name> [messages...] [options]\n\t\t\t\tconst name = args[1];\n\t\t\t\tif (!name) {\n\t\t\t\t\tconsole.error(\"Usage: pi agent <name> [messages...] 
[options]\");\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tconst apiKey = process.env.PI_API_KEY;\n\n\t\t\t\t// Pass all args after the model name\n\t\t\t\tconst agentArgs = args.slice(2);\n\n\t\t\t\t// If no messages provided, it's interactive mode\n\t\t\t\tawait promptModel(name, agentArgs, {\n\t\t\t\t\tpod: podOverride,\n\t\t\t\t\tapiKey,\n\t\t\t\t}).catch(() => {\n\t\t\t\t\t// Error already handled in promptModel, just exit cleanly\n\t\t\t\t\tprocess.exit(0);\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tdefault:\n\t\t\t\tconsole.error(`Unknown command: ${command}`);\n\t\t\t\tprintHelp();\n\t\t\t\tprocess.exit(1);\n\t\t}\n\t}\n} catch (error) {\n\tconsole.error(\"Error:\", error);\n\tprocess.exit(1);\n}\n"]}