clouditia 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clouditia/__init__.py +57 -0
- clouditia/client.py +656 -0
- clouditia/exceptions.py +114 -0
- clouditia/jobs.py +325 -0
- clouditia/magic.py +225 -0
- clouditia/py.typed +2 -0
- clouditia/results.py +116 -0
- clouditia-1.0.0.dist-info/METADATA +813 -0
- clouditia-1.0.0.dist-info/RECORD +12 -0
- clouditia-1.0.0.dist-info/WHEEL +5 -0
- clouditia-1.0.0.dist-info/licenses/LICENSE +21 -0
- clouditia-1.0.0.dist-info/top_level.txt +1 -0
clouditia/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Clouditia SDK - Execute Python and Shell code on remote GPU sessions.
|
|
3
|
+
|
|
4
|
+
This SDK provides a simple interface to interact with Clouditia GPU sessions,
|
|
5
|
+
allowing you to run Python code, shell commands, and long-running async jobs
|
|
6
|
+
on remote GPU-powered containers.
|
|
7
|
+
|
|
8
|
+
Basic Usage:
|
|
9
|
+
>>> from clouditia import GPUSession
|
|
10
|
+
>>> session = GPUSession("your_api_key")
|
|
11
|
+
>>> result = session.run("print('Hello from GPU!')")
|
|
12
|
+
>>> print(result.output)
|
|
13
|
+
Hello from GPU!
|
|
14
|
+
|
|
15
|
+
For more examples, see: https://clouditia.com/docs
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
__version__ = "1.0.0"
|
|
19
|
+
__author__ = "Clouditia Team"
|
|
20
|
+
__email__ = "support@clouditia.com"
|
|
21
|
+
|
|
22
|
+
from .client import GPUSession, connect
|
|
23
|
+
from .jobs import AsyncJob
|
|
24
|
+
from .results import ExecutionResult
|
|
25
|
+
from .exceptions import (
|
|
26
|
+
ClouditiaError,
|
|
27
|
+
AuthenticationError,
|
|
28
|
+
SessionError,
|
|
29
|
+
ExecutionError,
|
|
30
|
+
TimeoutError,
|
|
31
|
+
CommandBlockedError
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Jupyter magic loader
|
|
35
|
+
def load_ipython_extension(ipython):
    """Register the Clouditia Jupyter magics on the given IPython shell.

    The ``magic`` submodule is imported inside the function rather than at
    package import time (presumably so that importing ``clouditia`` does not
    require IPython -- TODO confirm against ``magic.py``).

    Args:
        ipython: The IPython shell instance, forwarded unchanged to
            ``clouditia.magic.load_ipython_extension``.
    """
    from . import magic as _magic

    _magic.load_ipython_extension(ipython)
|
|
39
|
+
|
|
40
|
+
# Names exported by ``from clouditia import *``; every entry is defined or
# imported earlier in this module.
__all__ = [
    # Main classes
    "GPUSession",
    "AsyncJob",
    "ExecutionResult",
    # Convenience functions
    "connect",
    "load_ipython_extension",
    # Exceptions
    "ClouditiaError",
    "AuthenticationError",
    "SessionError",
    "ExecutionError",
    "TimeoutError",
    "CommandBlockedError",
    # Version info
    "__version__",
]
|
clouditia/client.py
ADDED
|
@@ -0,0 +1,656 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Clouditia SDK - Main Client
|
|
3
|
+
|
|
4
|
+
This module provides the GPUSession class for interacting with remote GPU sessions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import base64
|
|
8
|
+
import inspect
|
|
9
|
+
import json
|
|
10
|
+
import pickle
|
|
11
|
+
import textwrap
|
|
12
|
+
from functools import wraps
|
|
13
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
import requests
|
|
16
|
+
|
|
17
|
+
from .exceptions import (
|
|
18
|
+
AuthenticationError,
|
|
19
|
+
ClouditiaError,
|
|
20
|
+
CommandBlockedError,
|
|
21
|
+
ExecutionError,
|
|
22
|
+
SessionError,
|
|
23
|
+
TimeoutError,
|
|
24
|
+
)
|
|
25
|
+
from .jobs import AsyncJob
|
|
26
|
+
from .results import ExecutionResult
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class GPUSession:
|
|
30
|
+
"""
|
|
31
|
+
Main class for interacting with a Clouditia GPU session.
|
|
32
|
+
|
|
33
|
+
A GPUSession represents a connection to a remote GPU-powered container.
|
|
34
|
+
You can execute Python code, run shell commands, and manage long-running
|
|
35
|
+
async jobs through this interface.
|
|
36
|
+
|
|
37
|
+
Attributes:
|
|
38
|
+
api_key (str): Your Clouditia API key
|
|
39
|
+
base_url (str): Base URL of the Clouditia API
|
|
40
|
+
timeout (int): Default timeout for synchronous operations (seconds)
|
|
41
|
+
poll_interval (int): Interval for polling async jobs (seconds)
|
|
42
|
+
|
|
43
|
+
Example:
|
|
44
|
+
>>> from clouditia import GPUSession
|
|
45
|
+
>>>
|
|
46
|
+
>>> # Create a session
|
|
47
|
+
>>> session = GPUSession("ck_your_api_key_here")
|
|
48
|
+
>>>
|
|
49
|
+
>>> # Execute Python code
|
|
50
|
+
>>> result = session.run("print('Hello from GPU!')")
|
|
51
|
+
>>> print(result.output)
|
|
52
|
+
Hello from GPU!
|
|
53
|
+
>>>
|
|
54
|
+
>>> # Execute shell commands
|
|
55
|
+
>>> result = session.shell("ls -la")
|
|
56
|
+
>>> print(result.output)
|
|
57
|
+
>>>
|
|
58
|
+
>>> # Submit long-running jobs
|
|
59
|
+
>>> job = session.submit("train_model()", name="training")
|
|
60
|
+
>>> job.wait(show_logs=True)
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
DEFAULT_BASE_URL = "https://clouditia.com/code-editor"
|
|
64
|
+
|
|
65
|
+
def __init__(
|
|
66
|
+
self,
|
|
67
|
+
api_key: str,
|
|
68
|
+
base_url: Optional[str] = None,
|
|
69
|
+
timeout: int = 120,
|
|
70
|
+
poll_interval: int = 5
|
|
71
|
+
):
|
|
72
|
+
"""
|
|
73
|
+
Initialize a GPU session.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
api_key: Your Clouditia API key (starts with 'ck_' or 'sk_')
|
|
77
|
+
base_url: Base URL of the API (default: https://clouditia.com/code-editor)
|
|
78
|
+
timeout: Default timeout for synchronous operations in seconds
|
|
79
|
+
poll_interval: Interval for polling async job status in seconds
|
|
80
|
+
|
|
81
|
+
Example:
|
|
82
|
+
>>> session = GPUSession("ck_abc123...")
|
|
83
|
+
>>> session = GPUSession("ck_abc123...", timeout=300)
|
|
84
|
+
"""
|
|
85
|
+
if not api_key:
|
|
86
|
+
raise AuthenticationError("API key is required")
|
|
87
|
+
|
|
88
|
+
self.api_key = api_key
|
|
89
|
+
self.base_url = (base_url or self.DEFAULT_BASE_URL).rstrip("/")
|
|
90
|
+
self.timeout = timeout
|
|
91
|
+
self.poll_interval = poll_interval
|
|
92
|
+
self._session_info: Optional[Dict] = None
|
|
93
|
+
self._remote_vars: Dict[str, bool] = {}
|
|
94
|
+
|
|
95
|
+
def _headers(self) -> Dict[str, str]:
|
|
96
|
+
"""Get headers with authentication."""
|
|
97
|
+
return {
|
|
98
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
99
|
+
"Content-Type": "application/json"
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
def _check_response(self, response: requests.Response) -> Dict:
|
|
103
|
+
"""Check response and raise appropriate exceptions."""
|
|
104
|
+
try:
|
|
105
|
+
data = response.json()
|
|
106
|
+
except json.JSONDecodeError:
|
|
107
|
+
raise ClouditiaError(f"Invalid JSON response: {response.text[:200]}")
|
|
108
|
+
|
|
109
|
+
if response.status_code == 401:
|
|
110
|
+
raise AuthenticationError(data.get("error", "Invalid API key"))
|
|
111
|
+
elif response.status_code == 403:
|
|
112
|
+
error_msg = data.get("error", "Access forbidden")
|
|
113
|
+
if "blocked" in error_msg.lower() or "blacklist" in error_msg.lower():
|
|
114
|
+
raise CommandBlockedError(error_msg)
|
|
115
|
+
raise SessionError(error_msg)
|
|
116
|
+
elif response.status_code == 404:
|
|
117
|
+
raise SessionError(data.get("error", "Resource not found"))
|
|
118
|
+
elif response.status_code >= 400:
|
|
119
|
+
raise ClouditiaError(data.get("error", f"HTTP {response.status_code}"))
|
|
120
|
+
|
|
121
|
+
if not data.get("success", True):
|
|
122
|
+
error = data.get("error", "Unknown error")
|
|
123
|
+
if "timeout" in error.lower():
|
|
124
|
+
raise TimeoutError(error)
|
|
125
|
+
if "blocked" in error.lower():
|
|
126
|
+
raise CommandBlockedError(error)
|
|
127
|
+
raise ExecutionError(error)
|
|
128
|
+
|
|
129
|
+
return data
|
|
130
|
+
|
|
131
|
+
def verify(self) -> Dict:
    """
    Check the API key against the server and cache the session details.

    Sends a GET request to ``<base_url>/api/verify/`` with a fixed
    30-second HTTP timeout. The decoded payload is stored and is
    afterwards available through the ``session_info`` property.

    Returns:
        Dict containing session_id, gpu_type, status, URLs, and billing info.

    Raises:
        AuthenticationError: If API key is invalid
        SessionError: If session is not active

    Example:
        >>> info = session.verify()
        >>> print(f"GPU: {info['gpu_type']}")
        >>> print(f"Credit: {info['user_credit']}€")
    """
    verify_url = f"{self.base_url}/api/verify/"
    http_response = requests.get(
        verify_url,
        headers=self._headers(),
        timeout=30,
    )
    session_data = self._check_response(http_response)
    self._session_info = session_data
    return session_data
|
|
155
|
+
|
|
156
|
+
def run(self, code: str, timeout: Optional[int] = None) -> ExecutionResult:
    """
    Run Python code on the remote GPU and wait for it to finish.

    This call blocks until the server answers. For long-running code,
    use submit() instead.

    Args:
        code: Python code to execute
        timeout: Timeout in seconds (default: self.timeout). A falsy
            value such as 0 also selects the default.

    Returns:
        ExecutionResult with output, result, and status.

    Raises:
        ExecutionError: If code execution fails
        TimeoutError: If execution times out

    Example:
        >>> # Simple execution
        >>> result = session.run("print('Hello!')")
        >>> print(result.output)  # "Hello!"

        >>> # Get return value
        >>> result = session.run("2 + 2")
        >>> print(result.result)  # "4"

        >>> # Multi-line code
        >>> result = session.run('''
        ... import torch
        ... x = torch.randn(100, 100, device='cuda')
        ... print(f"Shape: {x.shape}")
        ... ''')
    """
    effective_timeout = timeout if timeout else self.timeout

    # NOTE(review): the timeout only bounds the HTTP call (padded by 10 s);
    # it is not part of the request body sent to the server.
    http_response = requests.post(
        f"{self.base_url}/api/execute/",
        headers=self._headers(),
        json={"code": code},
        timeout=effective_timeout + 10,
    )
    payload = self._check_response(http_response)

    # Missing fields fall back to "successful run with no output".
    fields = {
        "output": payload.get("output", ""),
        "result": payload.get("result"),
        "error": payload.get("error", ""),
        "exit_code": int(payload.get("exit_code", 0)),
        "success": payload.get("success", True),
    }
    return ExecutionResult(**fields)
|
|
208
|
+
|
|
209
|
+
def exec(self, code: str, timeout: Optional[int] = None) -> bool:
|
|
210
|
+
"""
|
|
211
|
+
Execute Python code without returning a result.
|
|
212
|
+
|
|
213
|
+
Similar to run() but optimized for code that doesn't need a return value.
|
|
214
|
+
Raises an exception if execution fails.
|
|
215
|
+
|
|
216
|
+
Args:
|
|
217
|
+
code: Python code to execute
|
|
218
|
+
timeout: Timeout in seconds
|
|
219
|
+
|
|
220
|
+
Returns:
|
|
221
|
+
True if execution succeeded.
|
|
222
|
+
|
|
223
|
+
Raises:
|
|
224
|
+
ExecutionError: If code execution fails
|
|
225
|
+
|
|
226
|
+
Example:
|
|
227
|
+
>>> session.exec("import torch")
|
|
228
|
+
>>> session.exec("model = load_model()")
|
|
229
|
+
"""
|
|
230
|
+
result = self.run(code, timeout)
|
|
231
|
+
if not result.success:
|
|
232
|
+
raise ExecutionError(result.error)
|
|
233
|
+
return True
|
|
234
|
+
|
|
235
|
+
def shell(self, command: str, timeout: Optional[int] = None) -> ExecutionResult:
    """
    Run a shell command on the remote GPU pod and wait for it to finish.

    Args:
        command: Shell command to execute
        timeout: Timeout in seconds (a falsy value selects self.timeout)

    Returns:
        ExecutionResult with command output. Its ``result`` field is
        always None for shell commands.

    Raises:
        CommandBlockedError: If command is blocked by security filters
        ExecutionError: If command fails

    Example:
        >>> # List files
        >>> result = session.shell("ls -la /workspace")
        >>> print(result.output)

        >>> # Check disk space
        >>> result = session.shell("df -h")

        >>> # Multiple commands
        >>> result = session.shell("cd /workspace && ls && pwd")
    """
    effective_timeout = timeout if timeout else self.timeout

    # The HTTP timeout is the requested timeout padded by 10 seconds.
    http_response = requests.post(
        f"{self.base_url}/api/shell/",
        headers=self._headers(),
        json={"command": command},
        timeout=effective_timeout + 10,
    )
    payload = self._check_response(http_response)

    fields = {
        "output": payload.get("output", ""),
        "result": None,
        "error": payload.get("error", ""),
        "exit_code": int(payload.get("exit_code", 0)),
        "success": payload.get("success", True),
    }
    return ExecutionResult(**fields)
|
|
279
|
+
|
|
280
|
+
def set(self, name: str, value: Any) -> bool:
|
|
281
|
+
"""
|
|
282
|
+
Send a local variable to the remote GPU session.
|
|
283
|
+
|
|
284
|
+
The variable is serialized with pickle and stored in the Python
|
|
285
|
+
environment on the remote GPU pod.
|
|
286
|
+
|
|
287
|
+
Args:
|
|
288
|
+
name: Variable name on the remote session
|
|
289
|
+
value: Value to send (must be picklable)
|
|
290
|
+
|
|
291
|
+
Returns:
|
|
292
|
+
True if successful.
|
|
293
|
+
|
|
294
|
+
Raises:
|
|
295
|
+
ClouditiaError: If value cannot be serialized
|
|
296
|
+
|
|
297
|
+
Example:
|
|
298
|
+
>>> # Send data to GPU
|
|
299
|
+
>>> session.set("data", [1, 2, 3, 4, 5])
|
|
300
|
+
>>> session.run("print(sum(data))") # prints: 15
|
|
301
|
+
|
|
302
|
+
>>> # Send numpy arrays
|
|
303
|
+
>>> import numpy as np
|
|
304
|
+
>>> session.set("arr", np.random.randn(100, 100))
|
|
305
|
+
"""
|
|
306
|
+
try:
|
|
307
|
+
pickled = pickle.dumps(value)
|
|
308
|
+
value_b64 = base64.b64encode(pickled).decode()
|
|
309
|
+
except Exception as e:
|
|
310
|
+
raise ClouditiaError(f"Cannot serialize value: {e}")
|
|
311
|
+
|
|
312
|
+
code = f'''
|
|
313
|
+
import pickle
|
|
314
|
+
import base64
|
|
315
|
+
{name} = pickle.loads(base64.b64decode("{value_b64}"))
|
|
316
|
+
globals()["{name}"] = {name}
|
|
317
|
+
'''
|
|
318
|
+
result = self.run(code)
|
|
319
|
+
if not result.success:
|
|
320
|
+
raise ExecutionError(f"Failed to set variable: {result.error}")
|
|
321
|
+
|
|
322
|
+
self._remote_vars[name] = True
|
|
323
|
+
return True
|
|
324
|
+
|
|
325
|
+
def get(self, name: str) -> Any:
|
|
326
|
+
"""
|
|
327
|
+
Retrieve a variable from the remote GPU session.
|
|
328
|
+
|
|
329
|
+
The variable is serialized on the remote pod and deserialized locally.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
name: Variable name to retrieve
|
|
333
|
+
|
|
334
|
+
Returns:
|
|
335
|
+
The value of the variable.
|
|
336
|
+
|
|
337
|
+
Raises:
|
|
338
|
+
ExecutionError: If variable doesn't exist or can't be retrieved
|
|
339
|
+
|
|
340
|
+
Example:
|
|
341
|
+
>>> session.run("result = [i**2 for i in range(10)]")
|
|
342
|
+
>>> data = session.get("result")
|
|
343
|
+
>>> print(data) # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
|
|
344
|
+
"""
|
|
345
|
+
code = f'''
|
|
346
|
+
import pickle
|
|
347
|
+
import base64
|
|
348
|
+
_tmp_value = base64.b64encode(pickle.dumps({name})).decode()
|
|
349
|
+
_tmp_value
|
|
350
|
+
'''
|
|
351
|
+
result = self.run(code)
|
|
352
|
+
|
|
353
|
+
if not result.success:
|
|
354
|
+
raise ExecutionError(f"Failed to get variable: {result.error}")
|
|
355
|
+
|
|
356
|
+
if result.result is None:
|
|
357
|
+
raise ExecutionError(f"Variable '{name}' not found or not serializable")
|
|
358
|
+
|
|
359
|
+
try:
|
|
360
|
+
value_b64 = result.result.strip().strip("'\"")
|
|
361
|
+
pickled = base64.b64decode(value_b64)
|
|
362
|
+
return pickle.loads(pickled)
|
|
363
|
+
except Exception as e:
|
|
364
|
+
raise ClouditiaError(f"Cannot deserialize value: {e}")
|
|
365
|
+
|
|
366
|
+
def submit(
    self,
    code: str,
    name: Optional[str] = None,
    job_type: str = "python"
) -> AsyncJob:
    """
    Queue a job for asynchronous execution and return a handle to it.

    Use this for tasks that take hours or days (like model training).
    The job runs in the background and you can monitor its progress.
    The submission request itself uses a fixed 30-second HTTP timeout.

    Args:
        code: Python code or shell command to execute
        name: Optional name to identify the job (sent as "" when omitted)
        job_type: "python" or "shell"

    Returns:
        AsyncJob instance for monitoring and control.

    Example:
        >>> job = session.submit('''
        ... for epoch in range(100):
        ...     print(f"Epoch {epoch}/100")
        ...     train_one_epoch()
        ... save_model()
        ... ''', name="model_training")
        >>>
        >>> print(f"Job ID: {job.job_id}")
        >>>
        >>> # Wait with live logs
        >>> result = job.wait(show_logs=True)

        >>> # Or poll manually
        >>> while not job.is_done():
        ...     print(job.logs(new_only=True))
        ...     time.sleep(30)
    """
    request_body = {
        "code": code,
        "name": name or "",
        "type": job_type,
    }
    http_response = requests.post(
        f"{self.base_url}/api/jobs/submit/",
        headers=self._headers(),
        json=request_body,
        timeout=30,
    )
    payload = self._check_response(http_response)

    # The handle keeps the caller's original name (possibly None).
    return AsyncJob(session=self, job_id=payload["job_id"], name=name)
|
|
422
|
+
|
|
423
|
+
def jobs(
    self,
    status: Optional[str] = None,
    limit: int = 10
) -> List[AsyncJob]:
    """
    Fetch the async jobs known for this session.

    Args:
        status: Filter by status (pending, running, completed, failed, cancelled).
            Omitted from the query when empty or None.
        limit: Maximum number of jobs to return

    Returns:
        List of AsyncJob instances, one per entry in the response's
        "jobs" list (empty if that key is absent).

    Example:
        >>> # List all running jobs
        >>> running = session.jobs(status="running")
        >>> for job in running:
        ...     print(f"{job.name}: {job.status()}")

        >>> # List recent completed jobs
        >>> completed = session.jobs(status="completed", limit=5)
    """
    query = {"limit": limit}
    if status:
        query["status"] = status

    http_response = requests.get(
        f"{self.base_url}/api/jobs/list/",
        headers=self._headers(),
        params=query,
        timeout=30,
    )
    payload = self._check_response(http_response)

    handles = []
    for entry in payload.get("jobs", []):
        # Each handle is seeded with the raw listing entry via _data.
        handles.append(
            AsyncJob(
                session=self,
                job_id=entry["id"],
                name=entry.get("name"),
                _data=entry,
            )
        )
    return handles
|
|
469
|
+
|
|
470
|
+
def remote(
|
|
471
|
+
self,
|
|
472
|
+
func: Optional[Callable] = None,
|
|
473
|
+
**kwargs
|
|
474
|
+
) -> Callable:
|
|
475
|
+
"""
|
|
476
|
+
Decorator to execute a function on the remote GPU.
|
|
477
|
+
|
|
478
|
+
The function is serialized, sent to the GPU, executed, and the
|
|
479
|
+
result is returned locally. All arguments must be picklable.
|
|
480
|
+
|
|
481
|
+
Args:
|
|
482
|
+
func: Function to decorate
|
|
483
|
+
**kwargs: Options (async_mode, timeout)
|
|
484
|
+
|
|
485
|
+
Returns:
|
|
486
|
+
Decorated function that runs on remote GPU.
|
|
487
|
+
|
|
488
|
+
Example:
|
|
489
|
+
>>> @session.remote
|
|
490
|
+
... def compute_on_gpu(data):
|
|
491
|
+
... import torch
|
|
492
|
+
... tensor = torch.tensor(data, device='cuda')
|
|
493
|
+
... return (tensor ** 2).sum().item()
|
|
494
|
+
>>>
|
|
495
|
+
>>> result = compute_on_gpu([1, 2, 3, 4, 5])
|
|
496
|
+
>>> print(result) # 55
|
|
497
|
+
|
|
498
|
+
>>> # Async mode
|
|
499
|
+
>>> @session.remote(async_mode=True)
|
|
500
|
+
... def train():
|
|
501
|
+
... # long training code
|
|
502
|
+
... pass
|
|
503
|
+
>>>
|
|
504
|
+
>>> job = train() # Returns AsyncJob
|
|
505
|
+
"""
|
|
506
|
+
async_mode = kwargs.get("async_mode", False)
|
|
507
|
+
timeout = kwargs.get("timeout", self.timeout)
|
|
508
|
+
|
|
509
|
+
def decorator(fn: Callable) -> Callable:
|
|
510
|
+
@wraps(fn)
|
|
511
|
+
def wrapper(*args, **kw):
|
|
512
|
+
# Get function source
|
|
513
|
+
source = inspect.getsource(fn)
|
|
514
|
+
source = textwrap.dedent(source)
|
|
515
|
+
|
|
516
|
+
# Remove decorator from source
|
|
517
|
+
lines = source.split("\n")
|
|
518
|
+
func_start = 0
|
|
519
|
+
for i, line in enumerate(lines):
|
|
520
|
+
if line.strip().startswith("def "):
|
|
521
|
+
func_start = i
|
|
522
|
+
break
|
|
523
|
+
source = "\n".join(lines[func_start:])
|
|
524
|
+
|
|
525
|
+
# Serialize arguments
|
|
526
|
+
args_b64 = base64.b64encode(pickle.dumps(args)).decode()
|
|
527
|
+
kwargs_b64 = base64.b64encode(pickle.dumps(kw)).decode()
|
|
528
|
+
|
|
529
|
+
exec_code = f'''
|
|
530
|
+
import pickle
|
|
531
|
+
import base64
|
|
532
|
+
|
|
533
|
+
# Define the function
|
|
534
|
+
{source}
|
|
535
|
+
|
|
536
|
+
# Deserialize arguments
|
|
537
|
+
_args = pickle.loads(base64.b64decode("{args_b64}"))
|
|
538
|
+
_kwargs = pickle.loads(base64.b64decode("{kwargs_b64}"))
|
|
539
|
+
|
|
540
|
+
# Execute function
|
|
541
|
+
_result = {fn.__name__}(*_args, **_kwargs)
|
|
542
|
+
|
|
543
|
+
# Serialize result
|
|
544
|
+
_result_b64 = base64.b64encode(pickle.dumps(_result)).decode()
|
|
545
|
+
_result_b64
|
|
546
|
+
'''
|
|
547
|
+
|
|
548
|
+
if async_mode:
|
|
549
|
+
return self.submit(exec_code, name=fn.__name__)
|
|
550
|
+
else:
|
|
551
|
+
result = self.run(exec_code, timeout=timeout)
|
|
552
|
+
|
|
553
|
+
if not result.success:
|
|
554
|
+
raise ExecutionError(f"Remote execution failed: {result.error}")
|
|
555
|
+
|
|
556
|
+
if result.result:
|
|
557
|
+
try:
|
|
558
|
+
value_b64 = result.result.strip().strip("'\"")
|
|
559
|
+
pickled = base64.b64decode(value_b64)
|
|
560
|
+
return pickle.loads(pickled)
|
|
561
|
+
except Exception as e:
|
|
562
|
+
raise ClouditiaError(f"Cannot deserialize result: {e}")
|
|
563
|
+
|
|
564
|
+
return None
|
|
565
|
+
|
|
566
|
+
return wrapper
|
|
567
|
+
|
|
568
|
+
if func is not None:
|
|
569
|
+
return decorator(func)
|
|
570
|
+
return decorator
|
|
571
|
+
|
|
572
|
+
def gpu_info(self) -> Dict:
|
|
573
|
+
"""
|
|
574
|
+
Get detailed GPU information from the remote pod.
|
|
575
|
+
|
|
576
|
+
Returns:
|
|
577
|
+
Dict with GPU details (name, memory, utilization).
|
|
578
|
+
|
|
579
|
+
Example:
|
|
580
|
+
>>> info = session.gpu_info()
|
|
581
|
+
>>> for gpu in info['gpus']:
|
|
582
|
+
... print(f"{gpu['name']}: {gpu['memory_used_mb']}/{gpu['memory_total_mb']} MB")
|
|
583
|
+
"""
|
|
584
|
+
# Use a safe command to get GPU info
|
|
585
|
+
result = self.run('''
|
|
586
|
+
import subprocess
|
|
587
|
+
result = subprocess.run(
|
|
588
|
+
["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu",
|
|
589
|
+
"--format=csv,noheader,nounits"],
|
|
590
|
+
capture_output=True, text=True
|
|
591
|
+
)
|
|
592
|
+
print(result.stdout)
|
|
593
|
+
''')
|
|
594
|
+
|
|
595
|
+
if not result.success:
|
|
596
|
+
raise ExecutionError("Failed to get GPU info")
|
|
597
|
+
|
|
598
|
+
lines = result.output.strip().split("\n")
|
|
599
|
+
gpus = []
|
|
600
|
+
|
|
601
|
+
for line in lines:
|
|
602
|
+
parts = [p.strip() for p in line.split(",")]
|
|
603
|
+
if len(parts) >= 5:
|
|
604
|
+
gpus.append({
|
|
605
|
+
"name": parts[0],
|
|
606
|
+
"memory_total_mb": int(parts[1]),
|
|
607
|
+
"memory_used_mb": int(parts[2]),
|
|
608
|
+
"memory_free_mb": int(parts[3]),
|
|
609
|
+
"utilization_percent": int(parts[4])
|
|
610
|
+
})
|
|
611
|
+
|
|
612
|
+
return {"gpus": gpus, "count": len(gpus)}
|
|
613
|
+
|
|
614
|
+
@property
|
|
615
|
+
def session_info(self) -> Optional[Dict]:
|
|
616
|
+
"""
|
|
617
|
+
Get cached session information.
|
|
618
|
+
|
|
619
|
+
Call verify() first to populate this property.
|
|
620
|
+
|
|
621
|
+
Returns:
|
|
622
|
+
Session info dict or None if not yet verified.
|
|
623
|
+
"""
|
|
624
|
+
return self._session_info
|
|
625
|
+
|
|
626
|
+
def __repr__(self) -> str:
|
|
627
|
+
"""Return string representation."""
|
|
628
|
+
key_preview = self.api_key[:10] + "..." if len(self.api_key) > 10 else self.api_key
|
|
629
|
+
return f"GPUSession(api_key='{key_preview}')"
|
|
630
|
+
|
|
631
|
+
def __str__(self) -> str:
|
|
632
|
+
"""Return human-readable string."""
|
|
633
|
+
if self._session_info:
|
|
634
|
+
return f"GPUSession({self._session_info.get('session_name', 'connected')})"
|
|
635
|
+
return f"GPUSession(not verified)"
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
def connect(api_key: str, **kwargs) -> GPUSession:
    """
    Build a GPUSession from an API key.

    Convenience wrapper: ``connect(key, **opts)`` is the same as
    ``GPUSession(key, **opts)``.

    Args:
        api_key: Your Clouditia API key
        **kwargs: Additional options for GPUSession

    Returns:
        Configured GPUSession instance.

    Example:
        >>> from clouditia import connect
        >>> session = connect("ck_your_api_key")
        >>> result = session.run("print('Hello!')")
    """
    session = GPUSession(api_key, **kwargs)
    return session
|