hypercli-sdk 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
c3/jobs.py ADDED
@@ -0,0 +1,285 @@
1
+ """Jobs API"""
2
import base64
import uuid
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Iterator
5
+
6
+ if TYPE_CHECKING:
7
+ from .http import HTTPClient
8
+
9
+
10
+ @dataclass
11
+ class Job:
12
+ job_id: str
13
+ job_key: str
14
+ state: str
15
+ gpu_type: str
16
+ gpu_count: int
17
+ region: str
18
+ interruptible: bool
19
+ price_per_hour: float
20
+ price_per_second: float
21
+ docker_image: str
22
+ runtime: int
23
+ hostname: str | None = None
24
+ created_at: float | None = None
25
+ started_at: float | None = None
26
+ completed_at: float | None = None
27
+
28
+ @classmethod
29
+ def from_dict(cls, data: dict) -> "Job":
30
+ return cls(
31
+ job_id=data.get("job_id", ""),
32
+ job_key=data.get("job_key", ""),
33
+ state=data.get("state", ""),
34
+ gpu_type=data.get("gpu_type", ""),
35
+ gpu_count=data.get("gpu_count", 1),
36
+ region=data.get("region", ""),
37
+ interruptible=data.get("interruptible", True),
38
+ price_per_hour=data.get("price_per_hour", 0),
39
+ price_per_second=data.get("price_per_second", 0),
40
+ docker_image=data.get("docker_image", ""),
41
+ runtime=data.get("runtime", 0),
42
+ hostname=data.get("hostname"),
43
+ created_at=data.get("created_at"),
44
+ started_at=data.get("started_at"),
45
+ completed_at=data.get("completed_at"),
46
+ )
47
+
48
+
49
@dataclass
class GPUMetrics:
    """Per-GPU utilization snapshot from the metrics endpoint."""

    index: int
    name: str
    utilization: float
    memory_used: float
    memory_total: float
    temperature: int
    power_draw: float

    @classmethod
    def from_dict(cls, data: dict) -> "GPUMetrics":
        """Build GPUMetrics from an API dict, translating API key names to fields."""
        read = data.get
        return cls(
            index=read("index", 0),
            name=read("name", ""),
            # API reports percentages/MB/°C/W under suffixed keys.
            utilization=read("utilization_gpu_percent", 0),
            memory_used=read("memory_used_mb", 0),
            memory_total=read("memory_total_mb", 0),
            temperature=read("temperature_c", 0),
            power_draw=read("power_draw_w", 0),
        )
70
+
71
+
72
@dataclass
class SystemMetrics:
    """Host-level CPU and memory snapshot for a job."""

    cpu_percent: float
    cpu_cores: float
    cpu_unix_percent: float
    memory_used: float
    memory_limit: float

    @classmethod
    def from_dict(cls, data: dict) -> "SystemMetrics":
        """Build SystemMetrics from an API dict.

        ``cpu_unix_percent`` falls back to ``cpu_percent`` when absent.
        """
        cpu = data.get("cpu_percent", 0)
        return cls(
            cpu_percent=cpu,
            cpu_cores=data.get("cpu_cores", 1),
            cpu_unix_percent=data.get("cpu_unix_percent", cpu),
            memory_used=data.get("memory_used_mb", 0),
            memory_limit=data.get("memory_limit_mb", 0),
        )
89
+
90
+
91
@dataclass
class JobMetrics:
    """Combined GPU and system metrics for a single job."""

    gpus: list[GPUMetrics] = field(default_factory=list)
    system: SystemMetrics | None = None

    @classmethod
    def from_dict(cls, data: dict) -> "JobMetrics":
        """Build JobMetrics from an API dict; the ``system`` section is optional."""
        raw_system = data.get("system")
        parsed_system = SystemMetrics.from_dict(raw_system) if raw_system else None
        parsed_gpus = [GPUMetrics.from_dict(entry) for entry in data.get("gpus", [])]
        return cls(gpus=parsed_gpus, system=parsed_system)
103
+
104
+
105
class Jobs:
    """Jobs API wrapper.

    Thin convenience layer over the HTTP client: each method maps to one
    ``/api/jobs`` endpoint and converts responses into dataclasses.
    """

    def __init__(self, http: "HTTPClient"):
        self._http = http

    def list(self, state: str | None = None) -> list[Job]:
        """List all jobs.

        Args:
            state: Optional state filter (only jobs in this state are returned).

        Returns:
            List of Job objects; empty list when none match.
        """
        params = {"state": state} if state else None
        data = self._http.get("/api/jobs", params=params)
        # API returns {"jobs": [...], "total_count": ...}; tolerate a bare list too.
        jobs = data.get("jobs", []) if isinstance(data, dict) else data
        return [Job.from_dict(j) for j in jobs]

    def get(self, job_id: str) -> Job:
        """Get details for a single job by ID."""
        data = self._http.get(f"/api/jobs/{job_id}")
        return Job.from_dict(data)

    def create(
        self,
        image: str,
        command: str | None = None,
        gpu_type: str = "l40s",
        gpu_count: int = 1,
        region: str | None = None,
        runtime: int | None = None,
        interruptible: bool = True,
        env: dict[str, str] | None = None,
        ports: dict[str, int] | None = None,
        auth: bool = False,
    ) -> Job:
        """Create a new job.

        Args:
            image: Docker image to run
            command: Command to execute (base64 encoded internally)
            gpu_type: GPU type (e.g., "l40s", "a100")
            gpu_count: Number of GPUs
            region: Region to run in
            runtime: Max runtime in seconds
            interruptible: Allow spot/preemptible instances
            env: Environment variables
            ports: Ports to expose. Use {"lb": port} for HTTPS load balancer
            auth: Enable Bearer token auth on load balancer (use with ports={"lb": port})

        Returns:
            The created Job.
        """
        payload = {
            "docker_image": image,
            "gpu_type": gpu_type,
            "gpu_count": gpu_count,
            "interruptible": interruptible,
            # Server expects the command base64-encoded; an empty command is allowed.
            "command": base64.b64encode((command or "").encode()).decode(),
        }
        # Optional fields are only sent when set so server-side defaults apply.
        if region:
            payload["region"] = region
        if runtime:
            payload["runtime"] = runtime
        if env:
            payload["env_vars"] = env
        if ports:
            payload["ports"] = ports
        if auth:
            payload["auth"] = auth

        data = self._http.post("/api/jobs", json=payload)
        return Job.from_dict(data)

    def cancel(self, job_id: str) -> dict:
        """Cancel a job. Returns the raw API response dict."""
        return self._http.delete(f"/api/jobs/{job_id}")

    def extend(self, job_id: str, runtime: int) -> Job:
        """Extend job runtime (seconds)."""
        data = self._http.patch(f"/api/jobs/{job_id}", json={"runtime": runtime})
        return Job.from_dict(data)

    def logs(self, job_id: str) -> str:
        """Get job logs as a single string ("" when none)."""
        data = self._http.get(f"/api/jobs/{job_id}/logs")
        return data.get("logs", "")

    def metrics(self, job_id: str) -> JobMetrics:
        """Get job GPU/system metrics."""
        data = self._http.get(f"/api/jobs/{job_id}/metrics")
        return JobMetrics.from_dict(data)

    def token(self, job_id: str) -> str:
        """Get job auth token ("" when unavailable)."""
        data = self._http.get(f"/api/jobs/{job_id}/token")
        return data.get("token", "")
195
+
196
+
197
+ # Utility functions for finding jobs
198
+
199
+
200
def is_uuid(s: str) -> bool:
    """Check if string is a valid UUID (job ID).

    Uses real UUID parsing instead of the old ``"-" in s and len(s) > 30``
    heuristic, which wrongly classified long hyphenated hostnames as job IDs
    and made `find_job` skip the hostname/IP search paths.
    """
    try:
        uuid.UUID(s)
    except (ValueError, TypeError):
        return False
    return True
203
+
204
+
205
def find_by_id(jobs: Jobs, job_id: str) -> Job | None:
    """Find job by UUID via direct API call.

    Args:
        jobs: Jobs API instance
        job_id: Full job UUID

    Returns:
        Job if found, None if not found or error
    """
    try:
        result = jobs.get(job_id)
    except Exception:
        # Best-effort lookup: any API failure is reported as "not found".
        return None
    return result
219
+
220
+
221
def find_by_hostname(job_list: list[Job], hostname: str) -> Job | None:
    """Find job by hostname (exact or prefix match).

    Args:
        job_list: List of Job objects to search
        hostname: Hostname to match (can be partial prefix)

    Returns:
        First matching Job or None
    """
    matches = (
        candidate
        for candidate in job_list
        if candidate.hostname
        and (candidate.hostname == hostname or candidate.hostname.startswith(hostname))
    )
    return next(matches, None)
235
+
236
+
237
def find_by_ip(job_list: list[Job], ip: str) -> Job | None:
    """Find job by IP address (resolved via DNS from each job's hostname).

    Args:
        job_list: List of Job objects to search
        ip: IP address to match

    Returns:
        First matching Job or None
    """
    import socket

    named_jobs = (candidate for candidate in job_list if candidate.hostname)
    for candidate in named_jobs:
        try:
            resolved_ip = socket.gethostbyname(candidate.hostname)
        except socket.gaierror:
            # Hostname that does not resolve cannot match an IP; keep looking.
            continue
        if resolved_ip == ip:
            return candidate
    return None
259
+
260
+
261
def find_job(jobs: Jobs, identifier: str, state: str | None = None) -> Job | None:
    """Find a job by UUID, hostname, or IP address.

    Args:
        jobs: Jobs API instance
        identifier: Job UUID, hostname (partial match), or IP address
        state: Optional state filter for listing jobs

    Returns:
        Matching Job or None
    """
    # Try UUID first (direct API call, cheapest path)
    if is_uuid(identifier):
        job = find_by_id(jobs, identifier)
        if job:
            return job
        # BUGFIX: previously a failed UUID lookup returned None immediately.
        # Fall through so a UUID-looking identifier can still match a
        # hostname or IP below.

    # Get job list for hostname/IP search
    job_list = jobs.list(state=state)

    # Try hostname match
    job = find_by_hostname(job_list, identifier)
    if job:
        return job

    # Try IP match (slower, requires DNS lookup)
    return find_by_ip(job_list, identifier)
c3/logs.py ADDED
@@ -0,0 +1,273 @@
1
+ """Async log streaming for jobs"""
2
+ import asyncio
3
+ import json
4
+ from collections import deque
5
+ from typing import TYPE_CHECKING, AsyncIterator, Callable
6
+
7
+ import websockets
8
+
9
+ from .config import get_ws_url, WS_LOGS_PATH
10
+
11
+ if TYPE_CHECKING:
12
+ from .client import C3
13
+
14
+
15
+ # Default limits to prevent memory blowup
16
+ DEFAULT_MAX_INITIAL_LINES = 1000 # Max lines to fetch on initial REST call
17
+ DEFAULT_MAX_BUFFER = 5000 # Max lines to keep in memory buffer
18
+
19
+
20
+ def fetch_logs(c3: "C3", job_id: str, tail: int = None) -> list[str]:
21
+ """Fetch logs via REST API (one-time call).
22
+
23
+ Args:
24
+ c3: C3 client
25
+ job_id: Job ID
26
+ tail: Only return last N lines (default: all)
27
+
28
+ Returns:
29
+ List of log lines
30
+ """
31
+ try:
32
+ logs = c3.jobs.logs(job_id)
33
+ if not logs:
34
+ return []
35
+ lines = logs.strip().split("\n")
36
+ if tail and len(lines) > tail:
37
+ return lines[-tail:]
38
+ return lines
39
+ except Exception:
40
+ return []
41
+
42
+
43
class LogStream:
    """Async log streamer - websocket streaming with optional initial fetch.

    Usage:
        stream = LogStream(c3, job)
        await stream.connect()
        async for line in stream:
            print(line)
        await stream.close()

    This class guarantees:
    - Initial logs fetched ONCE on connect (limited to max_initial_lines)
    - All subsequent logs via websocket (NO polling)
    - Bounded buffer to prevent memory blowup
    - Proper cleanup on close
    """

    def __init__(
        self,
        c3: "C3",
        job_id: str,
        job_key: str | None = None,
        fetch_initial: bool = True,
        max_initial_lines: int = DEFAULT_MAX_INITIAL_LINES,
        max_buffer: int = DEFAULT_MAX_BUFFER,
    ):
        """
        Args:
            c3: C3 client
            job_id: Job ID for REST log fetch
            job_key: Job key for websocket (if None, fetched from job)
            fetch_initial: Whether to fetch existing logs on connect
            max_initial_lines: Max lines to fetch initially (prevents huge fetch)
            max_buffer: Max lines to keep in buffer (oldest dropped)
        """
        self.c3 = c3
        self.job_id = job_id
        self.job_key = job_key
        self.fetch_initial = fetch_initial
        self.max_initial_lines = max_initial_lines
        self.max_buffer = max_buffer

        # Websocket connection, set by connect(); None until then.
        self._ws = None
        # Bounded line buffer: deque(maxlen=...) silently drops oldest lines.
        self._buffer: deque[str] = deque(maxlen=max_buffer)
        # Guards so connect() never re-fetches the initial REST logs.
        self._initial_fetched = False
        self._connected = False
        self._closed = False

    @property
    def status(self) -> str:
        """Connection status: disconnected, connecting, connected, closed"""
        # Order matters: closed wins over connected, connected over connecting.
        if self._closed:
            return "closed"
        if self._connected:
            return "connected"
        if self._ws:
            return "connecting"
        return "disconnected"

    async def connect(self) -> list[str]:
        """Connect to log stream.

        Returns initial logs (if fetch_initial=True).
        After this, iterate with `async for line in stream`.

        Raises:
            RuntimeError: If the stream was already closed.
        """
        if self._closed:
            raise RuntimeError("LogStream is closed")

        initial_lines = []

        # Fetch initial logs ONCE (bounded)
        if self.fetch_initial and not self._initial_fetched:
            initial_lines = fetch_logs(self.c3, self.job_id, tail=self.max_initial_lines)
            for line in initial_lines:
                self._buffer.append(line)
            self._initial_fetched = True

        # Get job_key if not provided
        if not self.job_key:
            job = self.c3.jobs.get(self.job_id)
            self.job_key = job.job_key

        # Connect websocket
        # NOTE(review): if job_key is still falsy here, no websocket is opened
        # and status stays "disconnected" — iteration will raise RuntimeError.
        if self.job_key and not self._ws:
            ws_url = get_ws_url()
            full_url = f"{ws_url}{WS_LOGS_PATH}/{self.job_key}"
            self._ws = await websockets.connect(full_url)
            self._connected = True

        return initial_lines

    async def close(self) -> None:
        """Close the websocket connection (safe to call more than once)."""
        self._closed = True
        self._connected = False
        if self._ws:
            await self._ws.close()
            self._ws = None

    def get_buffer(self) -> list[str]:
        """Get current buffer contents (bounded, oldest may be dropped)"""
        return list(self._buffer)

    def clear_buffer(self) -> None:
        """Clear the buffer"""
        self._buffer.clear()

    async def __aiter__(self) -> AsyncIterator[str]:
        """Async iterate over NEW log lines from websocket.

        Note: This does NOT yield initial logs. Call connect() first
        and handle the returned initial lines separately.

        Raises:
            RuntimeError: If connect() has not been called.
        """
        if not self._ws:
            raise RuntimeError("Not connected. Call connect() first.")

        try:
            async for message in self._ws:
                if self._closed:
                    break
                try:
                    # Messages are JSON frames shaped {"event": "log", "log": "..."}.
                    data = json.loads(message)
                    if data.get("event") == "log" and data.get("log"):
                        # One frame may carry several newline-separated lines.
                        for line in data["log"].splitlines():
                            if line:
                                self._buffer.append(line)
                                yield line
                except json.JSONDecodeError:
                    # Skip frames that are not valid JSON.
                    continue
        except websockets.ConnectionClosed:
            # Server hung up; mark disconnected but do not raise to the consumer.
            self._connected = False

    async def __aenter__(self):
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
181
+
182
+
183
async def stream_logs(
    c3: "C3",
    job_id: str,
    on_line: Callable[[str], None],
    until_state: set[str] | None = None,
    poll_state_interval: float = 2.0,
    fetch_initial: bool = True,
    fetch_final: bool = True,
    max_initial_lines: int = DEFAULT_MAX_INITIAL_LINES,
) -> None:
    """Stream logs until job reaches a terminal state.

    Args:
        c3: C3 client
        job_id: Job ID to stream logs from
        on_line: Callback for each log line (called immediately, no buffering)
        until_state: States to stop on (default: terminal states)
        poll_state_interval: How often to check job STATE (NOT log polling!)
        fetch_initial: Fetch existing logs on start
        fetch_final: Fetch logs one more time after job terminates
        max_initial_lines: Max lines to fetch initially

    This function:
    - Fetches initial logs ONCE (bounded)
    - Streams via websocket (NO log polling)
    - Polls job STATE only (to detect termination)
    - Optionally fetches final logs ONCE when job terminates
    - Falls back to state polling when the job exposes no job_key
    """
    if until_state is None:
        until_state = {"succeeded", "failed", "canceled", "terminated"}

    job = c3.jobs.get(job_id)
    initial_fetched = False
    ws = None

    try:
        # Wait for job to be assigned/running
        while job.state in ("pending", "queued"):
            await asyncio.sleep(poll_state_interval)
            job = c3.jobs.get(job_id)

        # Check for immediate terminal state
        if job.state in until_state:
            if fetch_final:
                for line in fetch_logs(c3, job_id, tail=max_initial_lines):
                    on_line(line)
            return

        # Fetch initial logs ONCE when running (bounded)
        if fetch_initial and job.state == "running" and not initial_fetched:
            for line in fetch_logs(c3, job_id, tail=max_initial_lines):
                on_line(line)
            initial_fetched = True

        # Connect websocket (only possible when the job exposes a job_key)
        if job.job_key:
            ws_url = get_ws_url()
            full_url = f"{ws_url}{WS_LOGS_PATH}/{job.job_key}"
            ws = await websockets.connect(full_url)

        # Stream logs while checking job state periodically
        while True:
            if ws is None:
                # BUGFIX: without a job_key there is no websocket; the old code
                # crashed with AttributeError on ws.recv(). Fall back to polling
                # the job STATE until it reaches a terminal state.
                await asyncio.sleep(poll_state_interval)
                job = c3.jobs.get(job_id)
                if job.state in until_state:
                    break
                continue
            try:
                # Wait for message with timeout to allow state checks
                message = await asyncio.wait_for(ws.recv(), timeout=poll_state_interval)
                try:
                    data = json.loads(message)
                    if data.get("event") == "log" and data.get("log"):
                        for line in data["log"].splitlines():
                            if line:
                                on_line(line)
                except json.JSONDecodeError:
                    # Skip frames that are not valid JSON.
                    continue
            except asyncio.TimeoutError:
                # Check job state (NOT polling logs!)
                job = c3.jobs.get(job_id)
                if job.state in until_state:
                    break
            except websockets.ConnectionClosed:
                break

        # Fetch final logs ONCE (may have missed some during shutdown)
        if fetch_final:
            # Small delay to let final logs flush
            await asyncio.sleep(0.5)
            for line in fetch_logs(c3, job_id, tail=max_initial_lines):
                on_line(line)

    finally:
        if ws:
            await ws.close()