voltagegpu-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- volt/__init__.py +6 -0
- volt/cli/__init__.py +3 -0
- volt/cli/cli.py +417 -0
- volt/sdk/__init__.py +7 -0
- volt/sdk/client.py +270 -0
- volt/sdk/config.py +62 -0
- volt/sdk/decorators.py +200 -0
- volt/sdk/exceptions.py +29 -0
- volt/sdk/models.py +132 -0
- volt/sdk/utils.py +94 -0
- voltagegpu_cli-1.0.0.dist-info/METADATA +288 -0
- voltagegpu_cli-1.0.0.dist-info/RECORD +14 -0
- voltagegpu_cli-1.0.0.dist-info/WHEEL +4 -0
- voltagegpu_cli-1.0.0.dist-info/entry_points.txt +2 -0
volt/sdk/client.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""VoltageGPU SDK Client - Main API client for VoltageGPU."""
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from .config import Config
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Pod:
    """Represents a GPU pod.

    Instances are produced by ``VoltageGPUClient._parse_pod``, which
    normalizes the API's mixed camelCase/snake_case payloads.
    """
    id: str  # pod identifier; upstream may send "id", "uuid", or "podId"
    name: str
    status: str  # defaults to "unknown" when the API omits it
    gpu_type: str  # may fall back to the nested executor's GPU name
    gpu_count: int
    hourly_price: float  # price per hour; currency not stated by the API — assume USD, verify
    ssh_host: Optional[str] = None  # present only once SSH access is provisioned
    ssh_port: Optional[int] = None
    template_id: Optional[str] = None
    created_at: Optional[str] = None  # timestamp string as returned by the API — format not specified here
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class Template:
    """Represents a pod template (a bootable Docker image configuration)."""
    id: str
    name: str
    description: str
    docker_image: str  # image reference the pod boots from
    category: Optional[str] = None  # grouping label; filterable via list_templates(category=...)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class SSHKey:
    """Represents an SSH key registered for the current user."""
    id: str
    name: str
    public_key: str  # full public-key line as stored by the API
    fingerprint: Optional[str] = None
    created_at: Optional[str] = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class Machine:
    """Represents an available GPU machine type."""
    name: str
    hourly_price: float  # populated from the API's "price" field (see list_machines)
    total_gpu_count: int
    available: bool  # derived: True when total_gpu_count > 0 (see list_machines)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _as_list(data):
|
|
55
|
+
"""Normalize API response to a list."""
|
|
56
|
+
if isinstance(data, list):
|
|
57
|
+
return data
|
|
58
|
+
if isinstance(data, dict):
|
|
59
|
+
# Try common wrapper keys
|
|
60
|
+
for key in ("pods", "templates", "machines", "sshKeys", "keys", "data"):
|
|
61
|
+
if key in data:
|
|
62
|
+
return data[key]
|
|
63
|
+
return [data]
|
|
64
|
+
return []
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class VoltageGPUClient:
    """Main client for interacting with VoltageGPU API.

    Wraps a synchronous ``httpx.Client`` that authenticates every request
    via the ``X-API-Key`` header taken from :class:`Config`. Usable as a
    context manager; exiting the ``with`` block (or calling ``close()``)
    releases the connection pool.
    """

    def __init__(self, config: Optional[Config] = None):
        """Initialize the client with optional config.

        Args:
            config: Pre-built :class:`Config`; when omitted, ``Config.load()``
                resolves settings from environment variables / config files.
        """
        self.config = config or Config.load()
        self._client = httpx.Client(
            base_url=self.config.base_url,
            headers={
                "X-API-Key": self.config.api_key,
                "Content-Type": "application/json",
                "User-Agent": "VoltageGPU-CLI/1.0.0"
            },
            timeout=30.0  # seconds; applies to all requests on this client
        )

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # Ensure the connection pool is released when leaving a `with` block.
        self._client.close()

    def close(self):
        """Close the HTTP client."""
        self._client.close()

    # ==================== PODS ====================

    def list_pods(self) -> List[Pod]:
        """List all pods for the current user.

        Raises:
            httpx.HTTPStatusError: for non-2xx responses.
        """
        response = self._client.get("/volt/pods")
        response.raise_for_status()
        # _as_list tolerates both bare-list and wrapped-dict response shapes.
        return [self._parse_pod(p) for p in _as_list(response.json())]

    def get_pod(self, pod_id: str) -> Pod:
        """Get details of a specific pod."""
        response = self._client.get(f"/volt/pods/{pod_id}")
        response.raise_for_status()
        return self._parse_pod(response.json())

    def create_pod(
        self,
        template_id: str,
        name: str,
        gpu_count: int = 1,
        ssh_key_ids: Optional[List[str]] = None,
        docker_credentials_id: Optional[str] = None,
        env_vars: Optional[Dict[str, str]] = None
    ) -> Pod:
        """Create a new pod from a template.

        Args:
            template_id: Template to boot the pod from.
            name: Display name for the new pod.
            gpu_count: Number of GPUs to attach (default 1).
            ssh_key_ids: Optional SSH key ids granted access to the pod.
            docker_credentials_id: Optional registry-credentials id for
                pulling private images.
            env_vars: Optional environment variables injected into the pod.

        Returns:
            The created pod as parsed from the API response.
        """
        payload = {
            "templateId": template_id,
            "name": name,
            "gpuCount": gpu_count,
        }
        # Optional fields are added only when provided, keeping the payload minimal.
        if ssh_key_ids:
            payload["sshKeyIds"] = ssh_key_ids
        if docker_credentials_id:
            payload["dockerCredentialsId"] = docker_credentials_id
        if env_vars:
            payload["envVars"] = env_vars

        response = self._client.post("/volt/pods", json=payload)
        response.raise_for_status()
        return self._parse_pod(response.json())

    def start_pod(self, pod_id: str) -> Pod:
        """Start a stopped pod."""
        response = self._client.post(f"/volt/pods/{pod_id}/start")
        response.raise_for_status()
        return self._parse_pod(response.json())

    def stop_pod(self, pod_id: str) -> Pod:
        """Stop a running pod."""
        response = self._client.post(f"/volt/pods/{pod_id}/stop")
        response.raise_for_status()
        return self._parse_pod(response.json())

    def delete_pod(self, pod_id: str) -> bool:
        """Delete a pod.

        Returns:
            True on success; non-2xx responses raise instead of returning False.
        """
        response = self._client.delete(f"/volt/pods/{pod_id}")
        response.raise_for_status()
        return True

    def _parse_pod(self, data: Dict[str, Any]) -> Pod:
        """Parse pod data from API response.

        Tolerates both camelCase and snake_case keys, and falls back to the
        nested ``executor`` payload for GPU details when top-level fields
        are missing.
        """
        # Handle both camelCase and snake_case fields from Lium API
        executor = data.get("executor", {}) or {}
        gpu_details = executor.get("specs", {}).get("gpu", {}).get("details", [])
        gpu_name = gpu_details[0].get("name", "") if gpu_details else ""
        # NOTE(review): when executor is {} the else branch yields 1, which is
        # exactly what the chained .get defaults would produce — the trailing
        # guard is redundant but harmless.
        gpu_count = executor.get("specs", {}).get("gpu", {}).get("count", 1) if executor else 1

        return Pod(
            id=data.get("id", data.get("uuid", data.get("podId", ""))),
            name=data.get("name", ""),
            status=data.get("status", "unknown"),
            gpu_type=data.get("gpuType", data.get("gpu_type", gpu_name)),
            gpu_count=data.get("gpuCount", data.get("gpu_count", gpu_count)),
            hourly_price=float(data.get("hourlyPrice", data.get("price_per_hour",
                               data.get("hourly_price", 0)))),
            ssh_host=data.get("sshHost", data.get("ssh_host")),
            ssh_port=data.get("sshPort", data.get("ssh_port")),
            template_id=data.get("templateId", data.get("template_id",
                         data.get("template", {}).get("id") if isinstance(data.get("template"), dict) else None)),
            created_at=data.get("createdAt", data.get("created_at"))
        )

    # ==================== TEMPLATES ====================

    def list_templates(self, category: Optional[str] = None) -> List[Template]:
        """List available templates.

        Args:
            category: Optional server-side category filter.
        """
        params = {}
        if category:
            params["category"] = category
        response = self._client.get("/volt/templates", params=params)
        response.raise_for_status()
        return [self._parse_template(t) for t in _as_list(response.json())]

    def get_template(self, template_id: str) -> Template:
        """Get details of a specific template."""
        response = self._client.get(f"/volt/templates/{template_id}")
        response.raise_for_status()
        return self._parse_template(response.json())

    def _parse_template(self, data: Dict[str, Any]) -> Template:
        """Parse template data from API response (camelCase or snake_case)."""
        return Template(
            id=data.get("id", data.get("templateId", "")),
            name=data.get("name", ""),
            description=data.get("description", ""),
            docker_image=data.get("docker_image", data.get("dockerImage", "")),
            category=data.get("category")
        )

    # ==================== SSH KEYS ====================

    def list_ssh_keys(self) -> List[SSHKey]:
        """List all SSH keys for the current user."""
        response = self._client.get("/volt/ssh-keys")
        response.raise_for_status()
        return [self._parse_ssh_key(k) for k in _as_list(response.json())]

    def add_ssh_key(self, name: str, public_key: str) -> SSHKey:
        """Add a new SSH key.

        Args:
            name: Label for the key.
            public_key: Full public-key line (e.g. "ssh-ed25519 AAAA...").
        """
        response = self._client.post("/volt/ssh-keys", json={
            "name": name,
            "publicKey": public_key
        })
        response.raise_for_status()
        return self._parse_ssh_key(response.json())

    def delete_ssh_key(self, key_id: str) -> bool:
        """Delete an SSH key.

        Returns:
            True on success; non-2xx responses raise instead.
        """
        response = self._client.delete(f"/volt/ssh-keys/{key_id}")
        response.raise_for_status()
        return True

    def _parse_ssh_key(self, data: Dict[str, Any]) -> SSHKey:
        """Parse SSH key data from API response (camelCase or snake_case)."""
        return SSHKey(
            id=data.get("id", data.get("keyId", "")),
            name=data.get("name", ""),
            public_key=data.get("publicKey", data.get("public_key", "")),
            fingerprint=data.get("fingerprint"),
            created_at=data.get("createdAt", data.get("created_at"))
        )

    # ==================== MACHINES ====================

    def list_machines(self, gpu_type: Optional[str] = None) -> List[Machine]:
        """List available machines.

        Args:
            gpu_type: Optional client-side filter, matched case- and
                whitespace-insensitively as a substring of the machine name.
        """
        response = self._client.get("/volt/machines")
        response.raise_for_status()
        machines = []
        for m in _as_list(response.json()):
            name = m.get("name", "")
            # Flexible match: "RTX4090" matches "NVIDIA GeForce RTX 4090"
            if gpu_type and gpu_type.lower().replace(" ", "") not in name.lower().replace(" ", ""):
                continue
            machines.append(Machine(
                name=name,
                hourly_price=float(m.get("price", 0)),
                total_gpu_count=int(m.get("total_gpu_count", 0)),
                # Availability is derived purely from the reported GPU count.
                available=int(m.get("total_gpu_count", 0)) > 0
            ))
        return machines

    # ==================== ACCOUNT ====================

    def get_balance(self) -> float:
        """Get current account balance.

        Returns:
            The "balance" field of the response, or 0.0 when absent.
        """
        response = self._client.get("/user/balance")
        response.raise_for_status()
        data = response.json()
        return float(data.get("balance", 0))

    def get_account_info(self) -> Dict[str, Any]:
        """Get account information as the raw JSON payload."""
        response = self._client.get("/account")
        response.raise_for_status()
        return response.json()


__all__ = ["VoltageGPUClient", "Pod", "Template", "SSHKey", "Machine"]
|
volt/sdk/config.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Configuration loading for the VoltageGPU SDK."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Config:
    """Runtime configuration for the VoltageGPU SDK."""
    api_key: str
    base_url: str = "https://voltagegpu.com/api"
    base_pay_url: str = "https://pay-api.celiumcompute.ai"
    ssh_key_path: Optional[Path] = None  # private key; public half is read by ssh_public_keys

    @classmethod
    def load(cls) -> "Config":
        """Load config from env/file with smart defaults.

        API-key resolution order:
            1. ``VOLT_API_KEY`` env var, then legacy ``LIUM_API_KEY``.
            2. ``~/.volt/config.ini`` ([api] api_key), then legacy
               ``~/.lium/config.ini``.

        Raises:
            ValueError: if no API key can be found anywhere.
        """
        # Support both VOLT_API_KEY and legacy LIUM_API_KEY for compatibility
        api_key = os.getenv("VOLT_API_KEY") or os.getenv("LIUM_API_KEY")
        if not api_key:
            from configparser import ConfigParser
            # Check VoltageGPU config first, then fallback to legacy
            config_file = Path.home() / ".volt" / "config.ini"
            if not config_file.exists():
                config_file = Path.home() / ".lium" / "config.ini"
            if config_file.exists():
                config = ConfigParser()
                config.read(config_file)
                # BUGFIX: get(..., fallback=None) still raises NoSectionError
                # when the [api] section is absent (fallback only covers a
                # missing option). has_option() returns False for a missing
                # section too, so a malformed file now falls through to the
                # friendly ValueError below.
                if config.has_option("api", "api_key"):
                    api_key = config.get("api", "api_key")

        if not api_key:
            raise ValueError("No API key found. Set VOLT_API_KEY or ~/.volt/config.ini")

        # Find SSH key with fallback, preferring modern algorithms first.
        ssh_key = None
        for key_name in ("id_ed25519", "id_rsa", "id_ecdsa"):
            key_path = Path.home() / ".ssh" / key_name
            if key_path.exists():
                ssh_key = key_path
                break

        return cls(
            api_key=api_key,
            base_url=os.getenv("VOLT_BASE_URL", os.getenv("LIUM_BASE_URL", "https://voltagegpu.com/api")),
            base_pay_url=os.getenv("VOLT_PAY_URL", os.getenv("LIUM_PAY_URL", "https://pay-api.celiumcompute.ai")),
            ssh_key_path=ssh_key,
        )

    @property
    def ssh_public_keys(self) -> List[str]:
        """Get SSH public keys.

        Returns the public-key lines from the ``.pub`` file sitting next to
        ``ssh_key_path``; [] when no key path is configured or the file is
        missing. Only lines that look like key material (``ssh-*`` /
        ``ecdsa-*``) are kept.
        """
        if not self.ssh_key_path:
            return []
        pub_path = self.ssh_key_path.with_suffix('.pub')
        if not pub_path.exists():
            return []
        with open(pub_path) as f:
            return [line.strip() for line in f if line.strip().startswith(('ssh-', 'ecdsa-'))]


__all__ = ["Config"]
|
volt/sdk/decorators.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Higher-level decorators built on top of the Lium SDK."""
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import random
|
|
7
|
+
import shlex
|
|
8
|
+
import tempfile
|
|
9
|
+
import time
|
|
10
|
+
from functools import wraps
|
|
11
|
+
from typing import Optional, Sequence
|
|
12
|
+
|
|
13
|
+
from .client import Lium
|
|
14
|
+
from .exceptions import LiumError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def machine(
    machine: str,
    template_id: Optional[str] = None,
    cleanup: bool = True,
    requirements: Optional[Sequence[str]] = None,
):
    """Decorator to execute a function on a remote Lium machine.

    Creates a new pod, sends function source code and executes it remotely,
    returns the result, and optionally cleans up the pod.

    Args:
        machine: Machine type (e.g., "1xH200", "1xA100")
        template_id: Docker template ID (optional, uses default if not specified)
        cleanup: Whether to delete the pod after execution (default: True)
        requirements: Optional iterable of pip-installable packages to install on the pod

    NOTE(review): ``template_id`` is accepted but never referenced below —
    the pod is always created from ``sdk.default_docker_template(...)``;
    confirm whether it was meant to override the default. Also, this module
    imports ``Lium`` from ``.client``, while this package's ``client.py``
    defines ``VoltageGPUClient`` and exports no ``Lium`` — verify the
    intended SDK dependency.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Initialize SDK
            sdk = Lium()
            pod_info = None

            try:
                # Step 1: Find executor matching machine type
                # (case-insensitive substring match, e.g. "h200" in "1xH200")
                executors = sdk.ls()
                matching_executor = None
                for executor in executors:
                    if machine.lower() in executor.machine_name.lower():
                        matching_executor = executor
                        break

                if not matching_executor:
                    raise LiumError(f"No executor found matching machine type: {machine}")

                # Step 2: Create pod — name encodes the function and a timestamp
                pod_name = f"remote-{func.__name__}-{int(time.time())}"

                template = sdk.default_docker_template(matching_executor.id)

                pod_dict = sdk.up(
                    executor_id=matching_executor.id,
                    name=pod_name,
                    template_id=template.id if template else None,
                )

                # Wait for pod to be ready
                max_wait = 300  # seconds
                pod_info = sdk.wait_ready(pod_dict, timeout=max_wait)
                if not pod_info:
                    raise LiumError(f"Pod {pod_name} failed to start within {max_wait}s")

                # Step 3: Extract function source code without decorators
                func_source = inspect.getsource(func)
                func_name = func.__name__

                # Strip decorator lines - find the 'def' line and keep from there
                lines = func_source.split('\n')
                def_index = next(i for i, line in enumerate(lines) if 'def ' in line)
                func_source = '\n'.join(lines[def_index:])

                # Step 4: Create runner script with function source and arguments.
                # args/kwargs are embedded via repr(), so they must be
                # repr-round-trippable; the result must be JSON-serializable.
                runner_script = f'''#!/usr/bin/env python3
import sys
import traceback
import json

# Function source code
{func_source}

try:
    # Arguments
    args = {repr(args)}
    kwargs = {repr(kwargs)}

    # Execute function
    result = {func_name}(*args, **kwargs)

    # Save result as JSON
    with open('/tmp/result.json', 'w') as f:
        json.dump({{'success': True, 'result': result}}, f)

except Exception as e:
    # Save error
    with open('/tmp/result.json', 'w') as f:
        json.dump({{
            'success': False,
            'error': str(e),
            'traceback': traceback.format_exc()
        }}, f)
    sys.exit(1)
'''

                # Write runner script to temp file
                with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
                    runner_file = f.name
                    f.write(runner_script)

                try:
                    # Step 5: Upload runner script
                    sdk.upload(pod_info, local=runner_file, remote='/tmp/runner.py')

                    # Step 6: Create isolated virtual environment.
                    # Timestamp + random suffix avoids collisions between
                    # concurrent runs on the same pod.
                    venv_path = f"/tmp/lium_venv_{int(time.time())}_{random.randint(1000,9999)}"
                    venv_python = f"{venv_path}/bin/python"
                    venv_cmd = f"python3 -m venv {shlex.quote(venv_path)}"
                    venv_result = sdk.exec(pod_info, command=venv_cmd)
                    if not venv_result['success']:
                        raise LiumError(f"Failed to create virtual environment:\n{venv_result['stderr']}")

                    # Step 7: Install requirements if requested
                    reqs = [req for req in (requirements or []) if req]
                    if reqs:
                        # shlex.quote guards against shell metacharacters in specs
                        packages = " ".join(shlex.quote(req) for req in reqs)
                        install_cmd = f"{shlex.quote(venv_python)} -m pip install {packages}"
                        install_result = sdk.exec(pod_info, command=install_cmd)
                        if not install_result['success']:
                            raise LiumError(
                                "Failed installing requirements "
                                f"({', '.join(reqs)}):\n{install_result['stderr']}"
                            )

                    # Step 8: Execute runner via virtual environment python
                    exec_result = sdk.exec(
                        pod_info,
                        command=f"{shlex.quote(venv_python)} /tmp/runner.py"
                    )

                    # Step 9: Download result (even when execution failed to capture error details)
                    result_data = None
                    result_file = None
                    try:
                        # Create a closed local temp file for download to fill.
                        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
                            result_file = f.name
                        sdk.download(pod_info, remote='/tmp/result.json', local=result_file)
                        with open(result_file, 'r') as f:
                            result_data = json.load(f)
                    except Exception:
                        # Missing/corrupt result file: fall through to the
                        # stderr-based error below.
                        result_data = None
                    finally:
                        if result_file and os.path.exists(result_file):
                            os.unlink(result_file)

                    if result_data and result_data.get('success'):
                        return result_data['result']

                    # Construct detailed error message from the remote record
                    if result_data and not result_data.get('success', True):
                        err_msg = result_data.get('error', 'Unknown remote error')
                        tb = result_data.get('traceback')
                        if tb:
                            err_msg = f"{err_msg}\n\nTraceback:\n{tb}"
                        raise LiumError(f"Remote execution failed:\n{err_msg}")

                    stderr = exec_result.get('stderr') or exec_result.get('stdout') or 'Unknown remote error'
                    raise LiumError(f"Remote execution failed:\n{stderr}")

                finally:
                    # Clean up local temp file
                    os.unlink(runner_file)

            finally:
                # Remove virtual environment directory best-effort when pod stays alive.
                # 'venv_path' in locals() guards against failures before Step 6.
                if pod_info and 'venv_path' in locals():
                    try:
                        sdk.exec(pod_info, command=f"rm -rf {shlex.quote(venv_path)}")
                    except Exception:
                        pass

                # Step 10: Cleanup pod
                if cleanup and pod_info:
                    try:
                        sdk.down(pod_info)
                    except Exception:
                        pass  # Best effort cleanup

        return wrapper

    return decorator


__all__ = ["machine"]
|
volt/sdk/exceptions.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Exception hierarchy for the Lium SDK."""
|
|
2
|
+
|
|
3
|
+
class LiumError(Exception):
|
|
4
|
+
"""Base exception for Lium SDK."""
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LiumAuthError(LiumError):
|
|
8
|
+
"""Authentication error."""
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class LiumRateLimitError(LiumError):
|
|
12
|
+
"""Rate limit exceeded."""
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LiumServerError(LiumError):
|
|
16
|
+
"""Server error."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class LiumNotFoundError(LiumError):
|
|
20
|
+
"""Resource not found (404)."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"LiumError",
|
|
25
|
+
"LiumAuthError",
|
|
26
|
+
"LiumRateLimitError",
|
|
27
|
+
"LiumServerError",
|
|
28
|
+
"LiumNotFoundError",
|
|
29
|
+
]
|
volt/sdk/models.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Datamodels used across the Lium SDK."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
import re
|
|
5
|
+
from typing import Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class ExecutorInfo:
    """A single executor (GPU machine) listing entry."""
    id: str
    huid: str
    machine_name: str
    gpu_type: str
    gpu_count: int
    price_per_hour: float
    price_per_gpu_hour: float
    location: Dict
    specs: Dict
    status: str
    docker_in_docker: bool
    ip: str
    available_port_count: Optional[int] = None

    @property
    def driver_version(self) -> str:
        """GPU driver version read from ``specs``; '' when absent."""
        gpu_spec = self.specs.get('gpu', {})
        return gpu_spec.get('driver', '')

    @property
    def gpu_model(self) -> str:
        """Name of the first GPU entry in ``specs``; '' when none listed."""
        entries = self.specs.get('gpu', {}).get('details', [])
        if not entries:
            return ''
        return entries[0].get('name', '')
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class PodInfo:
    """A provisioned pod together with its SSH connection details."""
    id: str
    name: str
    status: str
    huid: str
    ssh_cmd: Optional[str]
    ports: Dict
    created_at: str
    updated_at: str
    executor: Optional[ExecutorInfo]
    template: Dict
    removal_scheduled_at: Optional[str]
    jupyter_installation_status: Optional[str]
    jupyter_url: Optional[str]

    @property
    def host(self) -> Optional[str]:
        """Hostname/IP parsed from ``ssh_cmd`` (the text after '@')."""
        if not self.ssh_cmd:
            return None
        found = re.findall(r'@(\S+)', self.ssh_cmd)
        return found[0] if found else None

    @property
    def username(self) -> Optional[str]:
        """Login name parsed from ``ssh_cmd`` (between 'ssh ' and '@')."""
        if not self.ssh_cmd:
            return None
        found = re.findall(r'ssh (\S+)@', self.ssh_cmd)
        return found[0] if found else None

    @property
    def ssh_port(self) -> int:
        """Extract SSH port from command; 22 when no '-p' flag is present."""
        if self.ssh_cmd and '-p ' in self.ssh_cmd:
            after_flag = self.ssh_cmd.split('-p ')[1]
            return int(after_flag.split()[0])
        return 22
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class Template:
    """Template information.

    NOTE: distinct from ``volt.sdk.client.Template``, which carries a
    different (smaller) field set.
    """
    id: str
    name: str
    huid: str  # short alternate identifier; exact semantics not shown here — verify
    docker_image: str
    docker_image_tag: str
    category: str
    status: str
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class BackupConfig:
    """Backup configuration information."""
    id: str
    huid: str
    pod_executor_id: str
    backup_frequency_hours: int  # interval between backups, in hours
    retention_days: int  # how long backups are retained, in days
    backup_path: str  # path covered by the backup — presumably on the pod; confirm
    is_active: bool
    created_at: str
    updated_at: Optional[str] = None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class BackupLog:
    """Backup log information (one entry per backup run)."""
    id: str
    huid: str
    backup_config_id: str  # links back to the owning BackupConfig
    status: str
    started_at: str
    completed_at: Optional[str] = None  # unset while the backup is in progress
    error_message: Optional[str] = None  # populated only on failure
    progress: Optional[float] = None  # presumably a 0–1 or percent fraction — verify
    backup_volume_id: Optional[str] = None
    created_at: Optional[str] = None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@dataclass
class VolumeInfo:
    """Volume information."""
    id: str
    huid: str
    name: str
    description: str
    created_at: str
    updated_at: Optional[str] = None
    # Usage metrics. The bytes/GB/MB fields look like redundant views of the
    # same quantity — confirm which one the API authoritatively sets.
    current_size_bytes: int = 0
    current_file_count: int = 0
    current_size_gb: float = 0.0
    current_size_mb: float = 0.0
    last_metrics_update: Optional[str] = None  # timestamp of the last metrics refresh


__all__ = [
    "ExecutorInfo",
    "PodInfo",
    "Template",
    "BackupConfig",
    "BackupLog",
    "VolumeInfo",
]
|