@draht/pods 2026.3.2-2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +511 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +346 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/models.d.ts +39 -0
- package/dist/commands/models.d.ts.map +1 -0
- package/dist/commands/models.js +658 -0
- package/dist/commands/models.js.map +1 -0
- package/dist/commands/pods.d.ts +21 -0
- package/dist/commands/pods.d.ts.map +1 -0
- package/dist/commands/pods.js +175 -0
- package/dist/commands/pods.js.map +1 -0
- package/dist/commands/prompt.d.ts +7 -0
- package/dist/commands/prompt.d.ts.map +1 -0
- package/dist/commands/prompt.js +54 -0
- package/dist/commands/prompt.js.map +1 -0
- package/dist/config.d.ts +11 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +74 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/model-configs.d.ts +22 -0
- package/dist/model-configs.d.ts.map +1 -0
- package/dist/model-configs.js +75 -0
- package/dist/model-configs.js.map +1 -0
- package/dist/models.json +295 -0
- package/dist/scripts/model_run.sh +83 -0
- package/dist/scripts/pod_setup.sh +336 -0
- package/dist/ssh.d.ts +24 -0
- package/dist/ssh.d.ts.map +1 -0
- package/dist/ssh.js +115 -0
- package/dist/ssh.js.map +1 -0
- package/dist/types.d.ts +23 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +40 -0
- package/scripts/model_run.sh +83 -0
- package/scripts/pod_setup.sh +336 -0
package/README.md
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
# pi
|
|
2
|
+
|
|
3
|
+
Deploy and manage LLMs on GPU pods with automatic vLLM configuration for agentic workloads.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install -g @mariozechner/pi
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## What is pi?
|
|
12
|
+
|
|
13
|
+
`pi` simplifies running large language models on remote GPU pods. It automatically:
|
|
14
|
+
- Sets up vLLM on fresh Ubuntu pods
|
|
15
|
+
- Configures tool calling for agentic models (Qwen, GPT-OSS, GLM, etc.)
|
|
16
|
+
- Manages multiple models on the same pod with "smart" GPU allocation
|
|
17
|
+
- Provides OpenAI-compatible API endpoints for each model
|
|
18
|
+
- Includes an interactive agent with file system tools for testing
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Set required environment variables
|
|
24
|
+
export HF_TOKEN=your_huggingface_token # Get from https://huggingface.co/settings/tokens
|
|
25
|
+
export PI_API_KEY=your_api_key # Any string you want for API authentication
|
|
26
|
+
|
|
27
|
+
# Setup a DataCrunch pod with NFS storage (models path auto-extracted)
|
|
28
|
+
pi pods setup dc1 "ssh root@1.2.3.4" \
|
|
29
|
+
--mount "sudo mount -t nfs -o nconnect=16 nfs.fin-02.datacrunch.io:/your-pseudo /mnt/hf-models"
|
|
30
|
+
|
|
31
|
+
# Start a model (automatic configuration for known models)
|
|
32
|
+
pi start Qwen/Qwen2.5-Coder-32B-Instruct --name qwen
|
|
33
|
+
|
|
34
|
+
# Send a single message to the model
|
|
35
|
+
pi agent qwen "What is the Fibonacci sequence?"
|
|
36
|
+
|
|
37
|
+
# Interactive chat mode with file system tools
|
|
38
|
+
pi agent qwen -i
|
|
39
|
+
|
|
40
|
+
# Use with any OpenAI-compatible client
|
|
41
|
+
export OPENAI_BASE_URL='http://1.2.3.4:8001/v1'
|
|
42
|
+
export OPENAI_API_KEY=$PI_API_KEY
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Prerequisites
|
|
46
|
+
|
|
47
|
+
- Node.js 18+
|
|
48
|
+
- HuggingFace token (for model downloads)
|
|
49
|
+
- GPU pod with:
|
|
50
|
+
- Ubuntu 22.04 or 24.04
|
|
51
|
+
- SSH root access
|
|
52
|
+
- NVIDIA drivers installed
|
|
53
|
+
- Persistent storage for models
|
|
54
|
+
|
|
55
|
+
## Supported Providers
|
|
56
|
+
|
|
57
|
+
### Primary Support
|
|
58
|
+
|
|
59
|
+
**DataCrunch** - Best for shared model storage
|
|
60
|
+
- NFS volumes sharable across multiple pods in same region
|
|
61
|
+
- Models download once, use everywhere
|
|
62
|
+
- Ideal for teams or multiple experiments
|
|
63
|
+
|
|
64
|
+
**RunPod** - Good persistent storage
|
|
65
|
+
- Network volumes persist independently
|
|
66
|
+
- Cannot share between running pods simultaneously
|
|
67
|
+
- Good for single-pod workflows
|
|
68
|
+
|
|
69
|
+
### Also Works With
|
|
70
|
+
- Vast.ai (volumes locked to specific machine)
|
|
71
|
+
- Prime Intellect (no persistent storage)
|
|
72
|
+
- AWS EC2 (with EFS setup)
|
|
73
|
+
- Any Ubuntu machine with NVIDIA GPUs, CUDA driver, and SSH
|
|
74
|
+
|
|
75
|
+
## Commands
|
|
76
|
+
|
|
77
|
+
### Pod Management
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pi pods setup <name> "<ssh>" [options] # Setup new pod
|
|
81
|
+
--mount "<mount_command>" # Run mount command during setup
|
|
82
|
+
--models-path <path> # Override extracted path (optional)
|
|
83
|
+
--vllm release|nightly|gpt-oss # vLLM version (default: release)
|
|
84
|
+
|
|
85
|
+
pi pods # List all configured pods
|
|
86
|
+
pi pods active <name> # Switch active pod
|
|
87
|
+
pi pods remove <name> # Remove pod from local config
|
|
88
|
+
pi shell [<name>] # SSH into pod
|
|
89
|
+
pi ssh [<name>] "<command>" # Run command on pod
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Note**: When using `--mount`, the models path is automatically extracted from the mount command's target directory. You only need `--models-path` if not using `--mount` or to override the extracted path.
|
|
93
|
+
|
|
94
|
+
#### vLLM Version Options
|
|
95
|
+
|
|
96
|
+
- `release` (default): Stable vLLM release, recommended for most users
|
|
97
|
+
- `nightly`: Latest vLLM features, needed for newest models like GLM-4.5
|
|
98
|
+
- `gpt-oss`: Special build for OpenAI's GPT-OSS models only
|
|
99
|
+
|
|
100
|
+
### Model Management
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pi start <model> --name <name> [options] # Start a model
|
|
104
|
+
--memory <percent> # GPU memory: 30%, 50%, 90% (default: 90%)
|
|
105
|
+
--context <size> # Context window: 4k, 8k, 16k, 32k, 64k, 128k
|
|
106
|
+
--gpus <count> # Number of GPUs to use (predefined models only)
|
|
107
|
+
--pod <name> # Target specific pod (overrides active)
|
|
108
|
+
--vllm <args...> # Pass custom args directly to vLLM
|
|
109
|
+
|
|
110
|
+
pi stop [<name>] # Stop model (or all if no name given)
|
|
111
|
+
pi list # List running models with status
|
|
112
|
+
pi logs <name> # Stream model logs (tail -f)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Agent & Chat Interface
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
pi agent <name> "<message>" # Single message to model
|
|
119
|
+
pi agent <name> "<msg1>" "<msg2>" # Multiple messages in sequence
|
|
120
|
+
pi agent <name> -i # Interactive chat mode
|
|
121
|
+
pi agent <name> -i -c # Continue previous session
|
|
122
|
+
|
|
123
|
+
# Standalone OpenAI-compatible agent (works with any API)
|
|
124
|
+
pi-agent --base-url http://localhost:8000/v1 --model llama-3.1 "Hello"
|
|
125
|
+
pi-agent --api-key sk-... "What is 2+2?" # Uses OpenAI by default
|
|
126
|
+
pi-agent --json "What is 2+2?" # Output event stream as JSONL
|
|
127
|
+
pi-agent -i # Interactive mode
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The agent includes tools for file operations (read, list, bash, glob, rg) to test agentic capabilities, particularly useful for code navigation and analysis tasks.
|
|
131
|
+
|
|
132
|
+
## Predefined Model Configurations
|
|
133
|
+
|
|
134
|
+
`pi` includes predefined configurations for popular agentic models, so you do not have to specify `--vllm` arguments manually. `pi` will also check if the model you selected can actually run on your pod with respect to the number of GPUs and available VRAM. Run `pi start` without additional arguments to see a list of predefined models that can run on the active pod.
|
|
135
|
+
|
|
136
|
+
### Qwen Models
|
|
137
|
+
```bash
|
|
138
|
+
# Qwen2.5-Coder-32B - Excellent coding model, fits on single H100/H200
|
|
139
|
+
pi start Qwen/Qwen2.5-Coder-32B-Instruct --name qwen
|
|
140
|
+
|
|
141
|
+
# Qwen3-Coder-30B - Advanced reasoning with tool use
|
|
142
|
+
pi start Qwen/Qwen3-Coder-30B-A3B-Instruct --name qwen3
|
|
143
|
+
|
|
144
|
+
# Qwen3-Coder-480B - State-of-the-art on 8xH200 (data-parallel mode)
|
|
145
|
+
pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-480b
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### GPT-OSS Models
|
|
149
|
+
```bash
|
|
150
|
+
# Requires special vLLM build during setup
|
|
151
|
+
pi pods setup gpt-pod "ssh root@1.2.3.4" --models-path /workspace --vllm gpt-oss
|
|
152
|
+
|
|
153
|
+
# GPT-OSS-20B - Fits on 16GB+ VRAM
|
|
154
|
+
pi start openai/gpt-oss-20b --name gpt20
|
|
155
|
+
|
|
156
|
+
# GPT-OSS-120B - Needs 60GB+ VRAM
|
|
157
|
+
pi start openai/gpt-oss-120b --name gpt120
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### GLM Models
|
|
161
|
+
```bash
|
|
162
|
+
# GLM-4.5 - Requires 8-16 GPUs, includes thinking mode
|
|
163
|
+
pi start zai-org/GLM-4.5 --name glm
|
|
164
|
+
|
|
165
|
+
# GLM-4.5-Air - Smaller version, 1-2 GPUs
|
|
166
|
+
pi start zai-org/GLM-4.5-Air --name glm-air
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Custom Models with --vllm
|
|
170
|
+
|
|
171
|
+
For models not in the predefined list, use `--vllm` to pass arguments directly to vLLM:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
# DeepSeek with custom settings
|
|
175
|
+
pi start deepseek-ai/DeepSeek-V3 --name deepseek --vllm \
|
|
176
|
+
--tensor-parallel-size 4 --trust-remote-code
|
|
177
|
+
|
|
178
|
+
# Mixtral with tensor and pipeline parallelism
|
|
179
|
+
pi start mistralai/Mixtral-8x22B-Instruct-v0.1 --name mixtral --vllm \
|
|
180
|
+
--tensor-parallel-size 8 --pipeline-parallel-size 2
|
|
181
|
+
|
|
182
|
+
# Any model with specific tool parser
|
|
183
|
+
pi start some/model --name mymodel --vllm \
|
|
184
|
+
--tool-call-parser hermes --enable-auto-tool-choice
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## DataCrunch Setup
|
|
188
|
+
|
|
189
|
+
DataCrunch offers the best experience with shared NFS storage across pods:
|
|
190
|
+
|
|
191
|
+
### 1. Create Shared Filesystem (SFS)
|
|
192
|
+
- Go to DataCrunch dashboard → Storage → Create SFS
|
|
193
|
+
- Choose size and datacenter
|
|
194
|
+
- Note the mount command (e.g., `sudo mount -t nfs -o nconnect=16 nfs.fin-02.datacrunch.io:/hf-models-fin02-8ac1bab7 /mnt/hf-models-fin02`)
|
|
195
|
+
|
|
196
|
+
### 2. Create GPU Instance
|
|
197
|
+
- Create instance in same datacenter as SFS
|
|
198
|
+
- Share the SFS with the instance
|
|
199
|
+
- Get SSH command from dashboard
|
|
200
|
+
|
|
201
|
+
### 3. Setup with pi
|
|
202
|
+
```bash
|
|
203
|
+
# Get mount command from DataCrunch dashboard
|
|
204
|
+
pi pods setup dc1 "ssh root@instance.datacrunch.io" \
|
|
205
|
+
--mount "sudo mount -t nfs -o nconnect=16 nfs.fin-02.datacrunch.io:/your-pseudo /mnt/hf-models"
|
|
206
|
+
|
|
207
|
+
# Models automatically stored in /mnt/hf-models (extracted from mount command)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### 4. Benefits
|
|
211
|
+
- Models persist across instance restarts
|
|
212
|
+
- Share models between multiple instances in same datacenter
|
|
213
|
+
- Download once, use everywhere
|
|
214
|
+
- Pay only for storage, not compute time during downloads
|
|
215
|
+
|
|
216
|
+
## RunPod Setup
|
|
217
|
+
|
|
218
|
+
RunPod offers good persistent storage with network volumes:
|
|
219
|
+
|
|
220
|
+
### 1. Create Network Volume (optional)
|
|
221
|
+
- Go to RunPod dashboard → Storage → Create Network Volume
|
|
222
|
+
- Choose size and region
|
|
223
|
+
|
|
224
|
+
### 2. Create GPU Pod
|
|
225
|
+
- Select "Network Volume" during pod creation (if using)
|
|
226
|
+
- Attach your volume to `/runpod-volume`
|
|
227
|
+
- Get SSH command from pod details
|
|
228
|
+
|
|
229
|
+
### 3. Setup with pi
|
|
230
|
+
```bash
|
|
231
|
+
# With network volume
|
|
232
|
+
pi pods setup runpod "ssh root@pod.runpod.io" --models-path /runpod-volume
|
|
233
|
+
|
|
234
|
+
# Or use workspace (persists with pod but not shareable)
|
|
235
|
+
pi pods setup runpod "ssh root@pod.runpod.io" --models-path /workspace
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
## Multi-GPU Support
|
|
240
|
+
|
|
241
|
+
### Automatic GPU Assignment
|
|
242
|
+
When running multiple models, pi automatically assigns them to different GPUs:
|
|
243
|
+
```bash
|
|
244
|
+
pi start model1 --name m1 # Auto-assigns to GPU 0
|
|
245
|
+
pi start model2 --name m2 # Auto-assigns to GPU 1
|
|
246
|
+
pi start model3 --name m3 # Auto-assigns to GPU 2
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### Specify GPU Count for Predefined Models
|
|
250
|
+
For predefined models with multiple configurations, use `--gpus` to control GPU usage:
|
|
251
|
+
```bash
|
|
252
|
+
# Run Qwen on 1 GPU instead of all available
|
|
253
|
+
pi start Qwen/Qwen2.5-Coder-32B-Instruct --name qwen --gpus 1
|
|
254
|
+
|
|
255
|
+
# Run GLM-4.5 on 8 GPUs (if it has an 8-GPU config)
|
|
256
|
+
pi start zai-org/GLM-4.5 --name glm --gpus 8
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
If the model doesn't have a configuration for the requested GPU count, you'll see available options.
|
|
260
|
+
|
|
261
|
+
### Tensor Parallelism for Large Models
|
|
262
|
+
For models that don't fit on a single GPU:
|
|
263
|
+
```bash
|
|
264
|
+
# Shard the model across 4 GPUs (set this to the number of GPUs on your pod)
|
|
265
|
+
pi start meta-llama/Llama-3.1-70B-Instruct --name llama70b --vllm \
|
|
266
|
+
--tensor-parallel-size 4
|
|
267
|
+
|
|
268
|
+
# Specific GPU count
|
|
269
|
+
pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen480 --vllm \
|
|
270
|
+
--data-parallel-size 8 --enable-expert-parallel
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## API Integration
|
|
274
|
+
|
|
275
|
+
All models expose OpenAI-compatible endpoints:
|
|
276
|
+
|
|
277
|
+
```python
|
|
278
|
+
from openai import OpenAI
|
|
279
|
+
|
|
280
|
+
client = OpenAI(
|
|
281
|
+
base_url="http://your-pod-ip:8001/v1",
|
|
282
|
+
api_key="your-pi-api-key"
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
# Chat completion with tool calling
|
|
286
|
+
response = client.chat.completions.create(
|
|
287
|
+
model="Qwen/Qwen2.5-Coder-32B-Instruct",
|
|
288
|
+
messages=[
|
|
289
|
+
{"role": "user", "content": "Write a Python function to calculate fibonacci"}
|
|
290
|
+
],
|
|
291
|
+
tools=[{
|
|
292
|
+
"type": "function",
|
|
293
|
+
"function": {
|
|
294
|
+
"name": "execute_code",
|
|
295
|
+
"description": "Execute Python code",
|
|
296
|
+
"parameters": {
|
|
297
|
+
"type": "object",
|
|
298
|
+
"properties": {
|
|
299
|
+
"code": {"type": "string"}
|
|
300
|
+
},
|
|
301
|
+
"required": ["code"]
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
}],
|
|
305
|
+
tool_choice="auto"
|
|
306
|
+
)
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
## Standalone Agent CLI
|
|
310
|
+
|
|
311
|
+
`pi` includes a standalone OpenAI-compatible agent that can work with any API:
|
|
312
|
+
|
|
313
|
+
```bash
|
|
314
|
+
# Install globally to get pi-agent command
|
|
315
|
+
npm install -g @mariozechner/pi
|
|
316
|
+
|
|
317
|
+
# Use with OpenAI
|
|
318
|
+
pi-agent --api-key sk-... "What is machine learning?"
|
|
319
|
+
|
|
320
|
+
# Use with local vLLM
|
|
321
|
+
pi-agent --base-url http://localhost:8000/v1 \
|
|
322
|
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
|
323
|
+
--api-key dummy \
|
|
324
|
+
"Explain quantum computing"
|
|
325
|
+
|
|
326
|
+
# Interactive mode
|
|
327
|
+
pi-agent -i
|
|
328
|
+
|
|
329
|
+
# Continue previous session
|
|
330
|
+
pi-agent --continue "Follow up question"
|
|
331
|
+
|
|
332
|
+
# Custom system prompt
|
|
333
|
+
pi-agent --system-prompt "You are a Python expert" "Write a web scraper"
|
|
334
|
+
|
|
335
|
+
# Use responses API (for GPT-OSS models)
|
|
336
|
+
pi-agent --api responses --model openai/gpt-oss-20b "Hello"
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
The agent supports:
|
|
340
|
+
- Session persistence across conversations
|
|
341
|
+
- Interactive TUI mode with syntax highlighting
|
|
342
|
+
- File system tools (read, list, bash, glob, rg) for code navigation
|
|
343
|
+
- Both Chat Completions and Responses API formats
|
|
344
|
+
- Custom system prompts
|
|
345
|
+
|
|
346
|
+
## Tool Calling Support
|
|
347
|
+
|
|
348
|
+
`pi` automatically configures appropriate tool calling parsers for known models:
|
|
349
|
+
|
|
350
|
+
- **Qwen models**: `hermes` parser (Qwen3-Coder uses `qwen3_coder`)
|
|
351
|
+
- **GLM models**: `glm4_moe` parser with reasoning support
|
|
352
|
+
- **GPT-OSS models**: Uses the `/v1/responses` endpoint, as tool calling (function calling in OpenAI parlance) is currently a [WIP with the `/v1/chat/completions` endpoint](https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html#tool-use).
|
|
353
|
+
- **Custom models**: Specify with `--vllm --tool-call-parser <parser> --enable-auto-tool-choice`
|
|
354
|
+
|
|
355
|
+
To disable tool calling:
|
|
356
|
+
```bash
|
|
357
|
+
pi start model --name mymodel --vllm --disable-tool-call-parser
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
## Memory and Context Management
|
|
361
|
+
|
|
362
|
+
### GPU Memory Allocation
|
|
363
|
+
Controls how much GPU memory vLLM pre-allocates:
|
|
364
|
+
- `--memory 30%`: High concurrency, limited context
|
|
365
|
+
- `--memory 50%`: Balanced
|
|
366
|
+
- `--memory 90%`: Maximum context, low concurrency (default)
|
|
367
|
+
|
|
368
|
+
### Context Window
|
|
369
|
+
Sets maximum input + output tokens:
|
|
370
|
+
- `--context 4k`: 4,096 tokens total
|
|
371
|
+
- `--context 32k`: 32,768 tokens total
|
|
372
|
+
- `--context 128k`: 131,072 tokens total
|
|
373
|
+
|
|
374
|
+
Example for coding workload:
|
|
375
|
+
```bash
|
|
376
|
+
# Large context for code analysis, moderate concurrency
|
|
377
|
+
pi start Qwen/Qwen2.5-Coder-32B-Instruct --name coder \
|
|
378
|
+
--context 64k --memory 70%
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
**Note**: When using `--vllm`, the `--memory`, `--context`, and `--gpus` parameters are ignored. You'll see a warning if you try to use them together.
|
|
382
|
+
|
|
383
|
+
## Session Persistence
|
|
384
|
+
|
|
385
|
+
The interactive agent mode (`-i`) saves sessions for each project directory:
|
|
386
|
+
|
|
387
|
+
```bash
|
|
388
|
+
# Start new session
|
|
389
|
+
pi agent qwen -i
|
|
390
|
+
|
|
391
|
+
# Continue previous session (maintains chat history)
|
|
392
|
+
pi agent qwen -i -c
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
Sessions are stored in `~/.pi/sessions/` organized by project path and include:
|
|
396
|
+
- Complete conversation history
|
|
397
|
+
- Tool call results
|
|
398
|
+
- Token usage statistics
|
|
399
|
+
|
|
400
|
+
## Architecture & Event System
|
|
401
|
+
|
|
402
|
+
The agent uses a unified event-based architecture where all interactions flow through `AgentEvent` types. This enables:
|
|
403
|
+
- Consistent UI rendering across console and TUI modes
|
|
404
|
+
- Session recording and replay
|
|
405
|
+
- Clean separation between API calls and UI updates
|
|
406
|
+
- JSON output mode for programmatic integration
|
|
407
|
+
|
|
408
|
+
Events are automatically converted to the appropriate API format (Chat Completions or Responses) based on the model type.
|
|
409
|
+
|
|
410
|
+
### JSON Output Mode
|
|
411
|
+
|
|
412
|
+
Use `--json` flag to output the event stream as JSONL (JSON Lines) for programmatic consumption:
|
|
413
|
+
```bash
|
|
414
|
+
pi-agent --api-key sk-... --json "What is 2+2?"
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
Each line is a complete JSON object representing an event:
|
|
418
|
+
```jsonl
|
|
419
|
+
{"type":"user_message","text":"What is 2+2?"}
|
|
420
|
+
{"type":"assistant_start"}
|
|
421
|
+
{"type":"assistant_message","text":"2 + 2 = 4"}
|
|
422
|
+
{"type":"token_usage","inputTokens":10,"outputTokens":5,"totalTokens":15,"cacheReadTokens":0,"cacheWriteTokens":0}
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
## Troubleshooting
|
|
426
|
+
|
|
427
|
+
### OOM (Out of Memory) Errors
|
|
428
|
+
- Reduce `--memory` percentage
|
|
429
|
+
- Use a smaller model or a quantized version (FP8)
|
|
430
|
+
- Reduce `--context` size
|
|
431
|
+
|
|
432
|
+
### Model Won't Start
|
|
433
|
+
```bash
|
|
434
|
+
# Check GPU usage
|
|
435
|
+
pi ssh "nvidia-smi"
|
|
436
|
+
|
|
437
|
+
# Check if port is in use
|
|
438
|
+
pi list
|
|
439
|
+
|
|
440
|
+
# Force stop all models
|
|
441
|
+
pi stop
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
### Tool Calling Issues
|
|
445
|
+
- Not all models support tool calling reliably
|
|
446
|
+
- Try a different parser: `--vllm --tool-call-parser mistral`
|
|
447
|
+
- Or disable: `--vllm --disable-tool-call-parser`
|
|
448
|
+
|
|
449
|
+
### Access Denied for Models
|
|
450
|
+
Some models (Llama, Mistral) require HuggingFace access approval. Visit the model page and click "Request access".
|
|
451
|
+
|
|
452
|
+
### vLLM Build Issues
|
|
453
|
+
If using `--vllm nightly` fails, try:
|
|
454
|
+
- Use `--vllm release` for stable version
|
|
455
|
+
- Check CUDA compatibility with `pi ssh "nvidia-smi"`
|
|
456
|
+
|
|
457
|
+
### Agent Not Finding Messages
|
|
458
|
+
If the agent shows its configuration instead of responding to your message, make sure the message is wrapped in quotes, especially when it contains special characters:
|
|
459
|
+
```bash
|
|
460
|
+
# Good
|
|
461
|
+
pi agent qwen "What is this file about?"
|
|
462
|
+
|
|
463
|
+
# Bad (shell might interpret special chars)
|
|
464
|
+
pi agent qwen What is this file about?
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
## Advanced Usage
|
|
468
|
+
|
|
469
|
+
### Working with Multiple Pods
|
|
470
|
+
```bash
|
|
471
|
+
# Override active pod for any command
|
|
472
|
+
pi start model --name test --pod dev-pod
|
|
473
|
+
pi list --pod prod-pod
|
|
474
|
+
pi stop test --pod dev-pod
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
### Custom vLLM Arguments
|
|
478
|
+
```bash
|
|
479
|
+
# Pass any vLLM argument after --vllm
|
|
480
|
+
pi start model --name custom --vllm \
|
|
481
|
+
--quantization awq \
|
|
482
|
+
--enable-prefix-caching \
|
|
483
|
+
--max-num-seqs 256 \
|
|
484
|
+
--gpu-memory-utilization 0.95
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
### Monitoring
|
|
488
|
+
```bash
|
|
489
|
+
# Watch GPU utilization
|
|
490
|
+
pi ssh "watch -n 1 nvidia-smi"
|
|
491
|
+
|
|
492
|
+
# Check model downloads
|
|
493
|
+
pi ssh "du -sh ~/.cache/huggingface/hub/*"
|
|
494
|
+
|
|
495
|
+
# View all logs
|
|
496
|
+
pi ssh "ls -la ~/.vllm_logs/"
|
|
497
|
+
|
|
498
|
+
# Check agent session history
|
|
499
|
+
ls -la ~/.pi/sessions/
|
|
500
|
+
```
|
|
501
|
+
|
|
502
|
+
## Environment Variables
|
|
503
|
+
|
|
504
|
+
- `HF_TOKEN` - HuggingFace token for model downloads
|
|
505
|
+
- `PI_API_KEY` - API key for vLLM endpoints
|
|
506
|
+
- `PI_CONFIG_DIR` - Config directory (default: `~/.pi`)
|
|
507
|
+
- `OPENAI_API_KEY` - Used by `pi-agent` when no `--api-key` is provided
|
|
508
|
+
|
|
509
|
+
## License
|
|
510
|
+
|
|
511
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":"","sourcesContent":["#!/usr/bin/env node\nimport chalk from \"chalk\";\nimport { spawn } from \"child_process\";\nimport { readFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport { listModels, showKnownModels, startModel, stopAllModels, stopModel, viewLogs } from \"./commands/models.js\";\nimport { listPods, removePodCommand, setupPod, switchActivePod } from \"./commands/pods.js\";\nimport { promptModel } from \"./commands/prompt.js\";\nimport { getActivePod, loadConfig } from \"./config.js\";\nimport { sshExecStream } from \"./ssh.js\";\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = dirname(__filename);\n\nconst packageJson = JSON.parse(readFileSync(join(__dirname, \"../package.json\"), \"utf-8\"));\n\nfunction printHelp() {\n\tconsole.log(`pi v${packageJson.version} - Manage vLLM deployments on GPU pods\n\nPod Management:\n pi pods setup <name> \"<ssh>\" --mount \"<mount>\" Setup pod with mount command\n Options:\n --vllm release Install latest vLLM release >=0.10.0 (default)\n --vllm nightly Install vLLM nightly build (latest features)\n --vllm gpt-oss Install vLLM 0.10.1+gptoss with PyTorch nightly (GPT-OSS only)\n pi pods List all pods (* = active)\n pi pods active <name> Switch active pod\n pi pods remove <name> Remove pod from local config\n pi shell [<name>] Open shell on pod (active or specified)\n pi ssh [<name>] \"<command>\" Run SSH command on pod\n\nModel Management:\n pi start <model> --name <name> [options] Start a model\n --memory <percent> GPU memory allocation (30%, 50%, 90%)\n --context <size> Context window (4k, 8k, 16k, 32k, 64k, 128k)\n --gpus <count> Number of GPUs to use (predefined models only)\n --vllm <args...> Pass remaining args to vLLM (ignores other options)\n pi stop [<name>] Stop model (or all if no name)\n pi list List running models\n pi logs <name> 
Stream model logs\n pi agent <name> [\"<message>\"...] [options] Chat with model using agent & tools\n pi agent <name> [options] Interactive chat mode\n --continue, -c Continue previous session\n --json Output as JSONL\n (All pi-agent options are supported)\n\n All model commands support --pod <name> to override the active pod.\n\nEnvironment:\n HF_TOKEN HuggingFace token for model downloads\n PI_API_KEY API key for vLLM endpoints\n PI_CONFIG_DIR Config directory (default: ~/.pi)`);\n}\n\n// Parse command line arguments\nconst args = process.argv.slice(2);\n\nif (args.length === 0 || args[0] === \"--help\" || args[0] === \"-h\") {\n\tprintHelp();\n\tprocess.exit(0);\n}\n\nif (args[0] === \"--version\" || args[0] === \"-v\") {\n\tconsole.log(packageJson.version);\n\tprocess.exit(0);\n}\n\nconst command = args[0];\nconst subcommand = args[1];\n\n// Main command handler\ntry {\n\t// Handle \"pi pods\" commands\n\tif (command === \"pods\") {\n\t\tif (!subcommand) {\n\t\t\t// pi pods - list all pods\n\t\t\tlistPods();\n\t\t} else if (subcommand === \"setup\") {\n\t\t\t// pi pods setup <name> \"<ssh>\" [--mount \"<mount>\"] [--models-path <path>] [--vllm release|nightly|gpt-oss]\n\t\t\tconst name = args[2];\n\t\t\tconst sshCmd = args[3];\n\n\t\t\tif (!name || !sshCmd) {\n\t\t\t\tconsole.error(\n\t\t\t\t\t'Usage: pi pods setup <name> \"<ssh>\" [--mount \"<mount>\"] [--models-path <path>] [--vllm release|nightly|gpt-oss]',\n\t\t\t\t);\n\t\t\t\tprocess.exit(1);\n\t\t\t}\n\n\t\t\t// Parse options\n\t\t\tconst options: { mount?: string; modelsPath?: string; vllm?: \"release\" | \"nightly\" | \"gpt-oss\" } = {};\n\t\t\tfor (let i = 4; i < args.length; i++) {\n\t\t\t\tif (args[i] === \"--mount\" && i + 1 < args.length) {\n\t\t\t\t\toptions.mount = args[i + 1];\n\t\t\t\t\ti++;\n\t\t\t\t} else if (args[i] === \"--models-path\" && i + 1 < args.length) {\n\t\t\t\t\toptions.modelsPath = args[i + 1];\n\t\t\t\t\ti++;\n\t\t\t\t} else if (args[i] === \"--vllm\" && i + 1 < args.length) 
{\n\t\t\t\t\tconst vllmType = args[i + 1];\n\t\t\t\t\tif (vllmType === \"release\" || vllmType === \"nightly\" || vllmType === \"gpt-oss\") {\n\t\t\t\t\t\toptions.vllm = vllmType;\n\t\t\t\t\t} else {\n\t\t\t\t\t\tconsole.error(chalk.red(`Invalid vLLM type: ${vllmType}`));\n\t\t\t\t\t\tconsole.error(\"Valid options: release, nightly, gpt-oss\");\n\t\t\t\t\t\tprocess.exit(1);\n\t\t\t\t\t}\n\t\t\t\t\ti++;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// If --mount provided but no --models-path, try to extract path from mount command\n\t\t\tif (options.mount && !options.modelsPath) {\n\t\t\t\t// Extract last part of mount command as models path\n\t\t\t\tconst parts = options.mount.trim().split(\" \");\n\t\t\t\tconst lastPart = parts[parts.length - 1];\n\t\t\t\tif (lastPart?.startsWith(\"/\")) {\n\t\t\t\t\toptions.modelsPath = lastPart;\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tawait setupPod(name, sshCmd, options);\n\t\t} else if (subcommand === \"active\") {\n\t\t\t// pi pods active <name>\n\t\t\tconst name = args[2];\n\t\t\tif (!name) {\n\t\t\t\tconsole.error(\"Usage: pi pods active <name>\");\n\t\t\t\tprocess.exit(1);\n\t\t\t}\n\t\t\tswitchActivePod(name);\n\t\t} else if (subcommand === \"remove\") {\n\t\t\t// pi pods remove <name>\n\t\t\tconst name = args[2];\n\t\t\tif (!name) {\n\t\t\t\tconsole.error(\"Usage: pi pods remove <name>\");\n\t\t\t\tprocess.exit(1);\n\t\t\t}\n\t\t\tremovePodCommand(name);\n\t\t} else {\n\t\t\tconsole.error(`Unknown pods subcommand: ${subcommand}`);\n\t\t\tprocess.exit(1);\n\t\t}\n\t} else {\n\t\t// Parse --pod override for model commands\n\t\tlet podOverride: string | undefined;\n\t\tconst podIndex = args.indexOf(\"--pod\");\n\t\tif (podIndex !== -1 && podIndex + 1 < args.length) {\n\t\t\tpodOverride = args[podIndex + 1];\n\t\t\t// Remove --pod and its value from args\n\t\t\targs.splice(podIndex, 2);\n\t\t}\n\n\t\t// Handle SSH/shell commands and model commands\n\t\tswitch (command) {\n\t\t\tcase \"shell\": {\n\t\t\t\t// pi shell [<name>] - open interactive 
shell\n\t\t\t\tconst podName = args[1];\n\t\t\t\tlet podInfo: { name: string; pod: import(\"./types.js\").Pod } | null = null;\n\n\t\t\t\tif (podName) {\n\t\t\t\t\tconst config = loadConfig();\n\t\t\t\t\tconst pod = config.pods[podName];\n\t\t\t\t\tif (pod) {\n\t\t\t\t\t\tpodInfo = { name: podName, pod };\n\t\t\t\t\t}\n\t\t\t\t} else {\n\t\t\t\t\tpodInfo = getActivePod();\n\t\t\t\t}\n\n\t\t\t\tif (!podInfo) {\n\t\t\t\t\tif (podName) {\n\t\t\t\t\t\tconsole.error(chalk.red(`Pod '${podName}' not found`));\n\t\t\t\t\t} else {\n\t\t\t\t\t\tconsole.error(chalk.red(\"No active pod. Use 'pi pods active <name>' to set one.\"));\n\t\t\t\t\t}\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tconsole.log(chalk.green(`Connecting to pod '${podInfo.name}'...`));\n\n\t\t\t\t// Execute SSH in interactive mode\n\t\t\t\tconst sshArgs = podInfo.pod.ssh.split(\" \").slice(1); // Remove 'ssh' from command\n\t\t\t\tconst sshProcess = spawn(\"ssh\", sshArgs, {\n\t\t\t\t\tstdio: \"inherit\",\n\t\t\t\t\tenv: process.env,\n\t\t\t\t});\n\n\t\t\t\tsshProcess.on(\"exit\", (code) => {\n\t\t\t\t\tprocess.exit(code || 0);\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"ssh\": {\n\t\t\t\t// pi ssh [<name>] \"<command>\" - run command via SSH\n\t\t\t\tlet podName: string | undefined;\n\t\t\t\tlet sshCommand: string;\n\n\t\t\t\tif (args.length === 2) {\n\t\t\t\t\t// pi ssh \"<command>\" - use active pod\n\t\t\t\t\tsshCommand = args[1];\n\t\t\t\t} else if (args.length === 3) {\n\t\t\t\t\t// pi ssh <name> \"<command>\"\n\t\t\t\t\tpodName = args[1];\n\t\t\t\t\tsshCommand = args[2];\n\t\t\t\t} else {\n\t\t\t\t\tconsole.error('Usage: pi ssh [<name>] \"<command>\"');\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tlet podInfo: { name: string; pod: import(\"./types.js\").Pod } | null = null;\n\n\t\t\t\tif (podName) {\n\t\t\t\t\tconst config = loadConfig();\n\t\t\t\t\tconst pod = config.pods[podName];\n\t\t\t\t\tif (pod) {\n\t\t\t\t\t\tpodInfo = { name: podName, pod };\n\t\t\t\t\t}\n\t\t\t\t} else 
{\n\t\t\t\t\tpodInfo = getActivePod();\n\t\t\t\t}\n\n\t\t\t\tif (!podInfo) {\n\t\t\t\t\tif (podName) {\n\t\t\t\t\t\tconsole.error(chalk.red(`Pod '${podName}' not found`));\n\t\t\t\t\t} else {\n\t\t\t\t\t\tconsole.error(chalk.red(\"No active pod. Use 'pi pods active <name>' to set one.\"));\n\t\t\t\t\t}\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tconsole.log(chalk.gray(`Running on pod '${podInfo.name}': ${sshCommand}`));\n\n\t\t\t\t// Execute command and stream output\n\t\t\t\tconst exitCode = await sshExecStream(podInfo.pod.ssh, sshCommand);\n\t\t\t\tprocess.exit(exitCode);\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"start\": {\n\t\t\t\t// pi start <model> --name <name> [options]\n\t\t\t\tconst modelId = args[1];\n\t\t\t\tif (!modelId) {\n\t\t\t\t\t// Show available models\n\t\t\t\t\tawait showKnownModels();\n\t\t\t\t\tprocess.exit(0);\n\t\t\t\t}\n\n\t\t\t\t// Parse options\n\t\t\t\tlet name: string | undefined;\n\t\t\t\tlet memory: string | undefined;\n\t\t\t\tlet context: string | undefined;\n\t\t\t\tlet gpus: number | undefined;\n\t\t\t\tconst vllmArgs: string[] = [];\n\t\t\t\tlet inVllmArgs = false;\n\n\t\t\t\tfor (let i = 2; i < args.length; i++) {\n\t\t\t\t\tif (inVllmArgs) {\n\t\t\t\t\t\tvllmArgs.push(args[i]);\n\t\t\t\t\t} else if (args[i] === \"--name\" && i + 1 < args.length) {\n\t\t\t\t\t\tname = args[i + 1];\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--memory\" && i + 1 < args.length) {\n\t\t\t\t\t\tmemory = args[i + 1];\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--context\" && i + 1 < args.length) {\n\t\t\t\t\t\tcontext = args[i + 1];\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--gpus\" && i + 1 < args.length) {\n\t\t\t\t\t\tgpus = parseInt(args[i + 1], 10);\n\t\t\t\t\t\tif (Number.isNaN(gpus) || gpus < 1) {\n\t\t\t\t\t\t\tconsole.error(chalk.red(\"--gpus must be a positive number\"));\n\t\t\t\t\t\t\tprocess.exit(1);\n\t\t\t\t\t\t}\n\t\t\t\t\t\ti++;\n\t\t\t\t\t} else if (args[i] === \"--vllm\") {\n\t\t\t\t\t\tinVllmArgs 
= true;\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\tif (!name) {\n\t\t\t\t\tconsole.error(\"--name is required\");\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\t// Warn if --vllm is used with other parameters\n\t\t\t\tif (vllmArgs.length > 0 && (memory || context || gpus)) {\n\t\t\t\t\tconsole.log(\n\t\t\t\t\t\tchalk.yellow(\"⚠ Warning: --memory, --context, and --gpus are ignored when --vllm is specified\"),\n\t\t\t\t\t);\n\t\t\t\t\tconsole.log(chalk.yellow(\" Using only custom vLLM arguments\"));\n\t\t\t\t\tconsole.log(\"\");\n\t\t\t\t}\n\n\t\t\t\tawait startModel(modelId, name, {\n\t\t\t\t\tpod: podOverride,\n\t\t\t\t\tmemory,\n\t\t\t\t\tcontext,\n\t\t\t\t\tgpus,\n\t\t\t\t\tvllmArgs: vllmArgs.length > 0 ? vllmArgs : undefined,\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"stop\": {\n\t\t\t\t// pi stop [name] - stop specific model or all models\n\t\t\t\tconst name = args[1];\n\t\t\t\tif (!name) {\n\t\t\t\t\t// Stop all models on the active pod\n\t\t\t\t\tawait stopAllModels({ pod: podOverride });\n\t\t\t\t} else {\n\t\t\t\t\tawait stopModel(name, { pod: podOverride });\n\t\t\t\t}\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"list\":\n\t\t\t\t// pi list\n\t\t\t\tawait listModels({ pod: podOverride });\n\t\t\t\tbreak;\n\t\t\tcase \"logs\": {\n\t\t\t\t// pi logs <name>\n\t\t\t\tconst name = args[1];\n\t\t\t\tif (!name) {\n\t\t\t\t\tconsole.error(\"Usage: pi logs <name>\");\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\t\t\t\tawait viewLogs(name, { pod: podOverride });\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tcase \"agent\": {\n\t\t\t\t// pi agent <name> [messages...] [options]\n\t\t\t\tconst name = args[1];\n\t\t\t\tif (!name) {\n\t\t\t\t\tconsole.error(\"Usage: pi agent <name> [messages...] 
[options]\");\n\t\t\t\t\tprocess.exit(1);\n\t\t\t\t}\n\n\t\t\t\tconst apiKey = process.env.PI_API_KEY;\n\n\t\t\t\t// Pass all args after the model name\n\t\t\t\tconst agentArgs = args.slice(2);\n\n\t\t\t\t// If no messages provided, it's interactive mode\n\t\t\t\tawait promptModel(name, agentArgs, {\n\t\t\t\t\tpod: podOverride,\n\t\t\t\t\tapiKey,\n\t\t\t\t}).catch(() => {\n\t\t\t\t\t// Error already handled in promptModel, just exit cleanly\n\t\t\t\t\tprocess.exit(0);\n\t\t\t\t});\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tdefault:\n\t\t\t\tconsole.error(`Unknown command: ${command}`);\n\t\t\t\tprintHelp();\n\t\t\t\tprocess.exit(1);\n\t\t}\n\t}\n} catch (error) {\n\tconsole.error(\"Error:\", error);\n\tprocess.exit(1);\n}\n"]}
|