hypercli-sdk 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- c3/__init__.py +57 -0
- c3/billing.py +72 -0
- c3/client.py +60 -0
- c3/config.py +70 -0
- c3/files.py +386 -0
- c3/http.py +217 -0
- c3/instances.py +211 -0
- c3/job/__init__.py +24 -0
- c3/job/base.py +249 -0
- c3/job/comfyui.py +1469 -0
- c3/jobs.py +285 -0
- c3/logs.py +273 -0
- c3/renders.py +339 -0
- c3/user.py +37 -0
- hypercli_sdk-0.4.2.dist-info/METADATA +141 -0
- hypercli_sdk-0.4.2.dist-info/RECORD +17 -0
- hypercli_sdk-0.4.2.dist-info/WHEEL +4 -0
c3/http.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""HTTP client utilities"""
|
|
2
|
+
import time
|
|
3
|
+
import httpx
|
|
4
|
+
from typing import Any, Optional, Iterator, Callable
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def request_with_retry(
    method: str,
    url: str,
    headers: dict = None,
    retries: int = 3,
    backoff: float = 1.0,
    timeout: float = 30.0,
    **kwargs,
) -> httpx.Response:
    """Make an HTTP request with retry logic for transient errors.

    Args:
        method: HTTP method (get, post, etc.)
        url: Full URL to request
        headers: Request headers
        retries: Number of retry attempts (must be >= 1)
        backoff: Backoff multiplier between retries
        timeout: Request timeout in seconds
        **kwargs: Additional args passed to httpx (json, params, etc.)

    Returns:
        httpx.Response

    Raises:
        ValueError: If retries < 1.
        httpx.TransportError: The last transport error if all retries fail.
    """
    if retries < 1:
        # The previous implementation fell through to `raise last_error`
        # with last_error=None here, which raised a confusing TypeError.
        raise ValueError("retries must be >= 1")

    for attempt in range(retries):
        try:
            with httpx.Client(timeout=timeout) as client:
                # client.request() validates the verb and accepts every
                # body kwarg, unlike getattr(client, method), which could
                # resolve to any client attribute and whose verb helpers
                # (e.g. .get) reject json=/content= arguments.
                return client.request(method.upper(), url, headers=headers, **kwargs)
        except (httpx.ProxyError, httpx.ConnectError, httpx.ReadTimeout):
            if attempt < retries - 1:
                # Linear backoff: backoff * 1, backoff * 2, ...
                time.sleep(backoff * (attempt + 1))
                continue
            # Out of attempts: surface the last transport error to the caller.
            raise
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class APIError(Exception):
    """Raised for non-2xx API responses.

    Attributes:
        status_code: HTTP status code returned by the server.
        detail: Error detail extracted from the response body.
    """
    status_code: int
    detail: str

    def __str__(self):
        return "API Error {}: {}".format(self.status_code, self.detail)
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _handle_response(response: httpx.Response) -> Any:
    """Decode an API response, raising APIError for 4xx/5xx statuses.

    Returns None for 204 No Content, otherwise the parsed JSON body.
    """
    status = response.status_code
    if status >= 400:
        # Prefer the structured "detail" field; fall back to the raw body
        # when the response is not JSON (or lacks a "detail" key).
        try:
            detail = response.json().get("detail", response.text)
        except Exception:
            detail = response.text
        raise APIError(status, detail)
    return None if status == 204 else response.json()
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class HTTPClient:
    """Sync HTTP client for the C3 API.

    All verb methods funnel through a single retried request path and
    decode responses via _handle_response(), so callers receive parsed
    JSON (or None for 204) or an APIError.
    """

    def __init__(self, base_url: str, api_key: str, timeout: float = 30.0):
        # Strip the trailing slash so f"{base_url}{path}" never doubles it.
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.timeout = timeout

    @property
    def headers(self) -> dict:
        """Default headers for JSON requests."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _request(self, method: str, path: str, **kwargs) -> Any:
        """Issue one retried request against the API and decode the response."""
        resp = request_with_retry(
            method, f"{self.base_url}{path}",
            headers=self.headers, timeout=self.timeout, **kwargs
        )
        return _handle_response(resp)

    def get(self, path: str, params: dict = None) -> Any:
        """GET with optional query parameters."""
        return self._request("get", path, params=params)

    def post(self, path: str, json: dict = None) -> Any:
        """POST with an optional JSON body."""
        return self._request("post", path, json=json)

    def patch(self, path: str, json: dict = None) -> Any:
        """PATCH with an optional JSON body."""
        return self._request("patch", path, json=json)

    def delete(self, path: str) -> Any:
        """DELETE the given path."""
        return self._request("delete", path)

    def stream_post(self, path: str, json: dict) -> Iterator[str]:
        """Streaming POST for SSE responses.

        Yields raw response lines; no retry, and no timeout so long-lived
        streams are not cut off.
        """
        with httpx.Client(timeout=None) as client:
            with client.stream(
                "POST",
                f"{self.base_url}{path}",
                headers=self.headers,
                json=json,
            ) as response:
                if response.status_code >= 400:
                    raise APIError(response.status_code, response.read().decode())
                for line in response.iter_lines():
                    yield line

    def post_multipart(self, path: str, files: dict, params: dict = None) -> Any:
        """POST with multipart form data for file uploads.

        Args:
            path: API path
            files: Dict of {field_name: file_tuple} where file_tuple is
                (filename, file_bytes, content_type) or just file_bytes
            params: Optional query parameters (added for parity with
                AsyncHTTPClient.post_multipart; defaults to None)
        """
        # Build headers without Content-Type so httpx can set the
        # multipart boundary itself.
        headers = {"Authorization": f"Bearer {self.api_key}"}

        with httpx.Client(timeout=self.timeout) as client:
            response = client.post(
                f"{self.base_url}{path}",
                headers=headers,
                files=files,
                params=params,
            )
            return _handle_response(response)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class AsyncHTTPClient:
    """Async HTTP client for use in async contexts (e.g., Telegram bot, web servers).

    Each call opens a short-lived AsyncClient; all verbs share one private
    request path and decode responses via _handle_response().
    """

    def __init__(self, base_url: str, api_key: str, timeout: float = 30.0):
        # Strip the trailing slash so f"{base_url}{path}" never doubles it.
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.timeout = timeout

    @property
    def headers(self) -> dict:
        """Default headers for JSON requests."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    async def _request(self, method: str, path: str, headers: dict = None, **kwargs) -> Any:
        """Issue one request against the API and decode the response.

        A headers override is accepted so multipart uploads can omit
        Content-Type (httpx must set the multipart boundary itself).
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.request(
                method,
                f"{self.base_url}{path}",
                headers=self.headers if headers is None else headers,
                **kwargs,
            )
            return _handle_response(response)

    async def get(self, path: str, params: dict = None) -> Any:
        """GET with optional query parameters."""
        return await self._request("GET", path, params=params)

    async def post(self, path: str, json: dict = None) -> Any:
        """POST with an optional JSON body."""
        return await self._request("POST", path, json=json)

    async def patch(self, path: str, json: dict = None) -> Any:
        """PATCH with an optional JSON body."""
        return await self._request("PATCH", path, json=json)

    async def delete(self, path: str) -> Any:
        """DELETE the given path."""
        return await self._request("DELETE", path)

    async def post_multipart(self, path: str, files: dict, params: dict = None) -> Any:
        """POST with multipart form data for file uploads.

        Args:
            path: API path
            files: Dict of {field_name: file_tuple} where file_tuple is
                (filename, file_bytes, content_type) or just file_bytes
            params: Optional query parameters
        """
        # Auth only - no Content-Type, so httpx sets the multipart boundary.
        headers = {"Authorization": f"Bearer {self.api_key}"}
        return await self._request("POST", path, headers=headers, files=files, params=params)
|
c3/instances.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Instances API - GPU types, regions, and pricing"""
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from .http import HTTPClient
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class GPUConfig:
    """Resources and region availability for one GPU count of a GPU type."""
    gpu_count: int
    cpu_cores: float
    memory_gb: float
    storage_gb: float
    regions: list[str]

    @classmethod
    def from_dict(cls, data: dict) -> "GPUConfig":
        """Build a config from an API payload, tolerating missing keys."""
        get = data.get
        return cls(
            gpu_count=get("gpu_count", 1),
            cpu_cores=get("cpu_cores", 0),
            memory_gb=get("memory_gb", 0),
            storage_gb=get("storage_gb", 0),
            regions=get("regions", []),
        )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class GPUType:
    """A GPU type together with its per-count configurations."""
    id: str
    name: str
    description: str
    configs: list[GPUConfig]

    @classmethod
    def from_dict(cls, id: str, data: dict) -> "GPUType":
        """Build a GPU type from an API payload; name falls back to the id."""
        parsed_configs = [GPUConfig.from_dict(raw) for raw in data.get("configs", [])]
        return cls(
            id=id,
            name=data.get("name", id),
            description=data.get("description", ""),
            configs=parsed_configs,
        )

    def available_regions(self, gpu_count: int = 1) -> list[str]:
        """Regions where this GPU is offered at the given count ([] if none)."""
        matching = next((c for c in self.configs if c.gpu_count == gpu_count), None)
        return matching.regions if matching is not None else []

    def available_counts(self) -> list[int]:
        """GPU counts that have at least one region available."""
        return [cfg.gpu_count for cfg in self.configs if cfg.regions]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class Region:
    """A datacenter region."""
    id: str
    description: str
    country: str

    @classmethod
    def from_dict(cls, id: str, data: dict) -> "Region":
        """Build a region from an API payload; description falls back to the id."""
        description = data.get("description", id)
        country = data.get("country", "")
        return cls(id=id, description=description, country=country)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class PricingTier:
    """Hourly pricing for one region (None when a tier is not offered)."""
    region: str
    on_demand: float | None = None
    interruptible: float | None = None

    @classmethod
    def from_dict(cls, region: str, data: dict) -> "PricingTier":
        """Build a tier from an API payload.

        The upstream API spells the spot tier "interruptable" (sic); keep
        reading that key verbatim.
        """
        return cls(
            region=region,
            on_demand=data.get("on-demand"),
            interruptible=data.get("interruptable"),
        )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class GPUPricing:
    """Per-region pricing for one GPU configuration (type + count)."""
    gpu_type: str
    gpu_count: int
    tiers: list[PricingTier]

    @classmethod
    def from_key(cls, key: str, data: dict) -> "GPUPricing":
        """Build pricing from a key like "h100_x8" -> type "h100", count 8.

        Keys without an "_x" suffix default to a count of 1.
        """
        head, sep, tail = key.rpartition("_x")
        if sep:
            gpu_type, gpu_count = head, int(tail)
        else:
            gpu_type, gpu_count = key, 1
        tiers = [
            PricingTier.from_dict(region, prices)
            for region, prices in data.items()
        ]
        return cls(gpu_type=gpu_type, gpu_count=gpu_count, tiers=tiers)

    def get_price(self, region: str, interruptible: bool = True) -> float | None:
        """Price for the region/tier, or None if the region is absent."""
        for tier in self.tiers:
            if tier.region != region:
                continue
            return tier.interruptible if interruptible else tier.on_demand
        return None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class Instances:
    """Instances API - GPU types, regions, and pricing.

    Fetches catalog data from the API and memoizes it per Instances
    instance; pass refresh=True to any accessor to refetch.

    NOTE(review): the accessors return the cache dicts themselves, so
    callers mutating the result would corrupt the cache - confirm callers
    treat these as read-only.
    """

    def __init__(self, http: "HTTPClient"):
        # _http performs the authenticated API calls; the three caches
        # start as None (meaning "not fetched yet", distinct from {}).
        self._http = http
        self._types_cache: dict[str, GPUType] | None = None
        self._regions_cache: dict[str, Region] | None = None
        self._pricing_cache: dict[str, GPUPricing] | None = None

    def types(self, refresh: bool = False) -> dict[str, GPUType]:
        """Get available GPU types, keyed by type id (cached)."""
        if self._types_cache is None or refresh:
            data = self._http.get("/instances/types")
            self._types_cache = {
                id: GPUType.from_dict(id, info) for id, info in data.items()
            }
        return self._types_cache

    def regions(self, refresh: bool = False) -> dict[str, Region]:
        """Get available regions, keyed by region id (cached)."""
        if self._regions_cache is None or refresh:
            data = self._http.get("/instances/regions")
            self._regions_cache = {
                id: Region.from_dict(id, info) for id, info in data.items()
            }
        return self._regions_cache

    def pricing(self, refresh: bool = False) -> dict[str, GPUPricing]:
        """Get pricing information, keyed by "{type}_x{count}" (cached)."""
        if self._pricing_cache is None or refresh:
            data = self._http.get("/instances/pricing")
            self._pricing_cache = {
                key: GPUPricing.from_key(key, prices) for key, prices in data.items()
            }
        return self._pricing_cache

    def get_type(self, gpu_type: str) -> GPUType | None:
        """Get a specific GPU type by ID, or None if unknown."""
        return self.types().get(gpu_type)

    def get_region(self, region_id: str) -> Region | None:
        """Get a specific region by ID, or None if unknown."""
        return self.regions().get(region_id)

    def get_price(
        self, gpu_type: str, gpu_count: int = 1, region: str = None, interruptible: bool = True
    ) -> float | None:
        """Get price for a specific GPU configuration.

        Returns None when the configuration is unknown - and also when
        region is None, even if pricing exists (a region is required to
        resolve a concrete price).
        """
        # Pricing keys follow the API's "{type}_x{count}" convention.
        key = f"{gpu_type}_x{gpu_count}"
        pricing = self.pricing().get(key)
        if pricing and region:
            return pricing.get_price(region, interruptible)
        return None

    def list_available(self, gpu_type: str = None, region: str = None) -> list[dict]:
        """List available GPU configurations, optionally filtered.

        Joins types, regions, and pricing into one flat row per
        (gpu_type, gpu_count, region) combination. Configs with no
        regions are skipped entirely.
        """
        types = self.types()
        regions = self.regions()
        pricing = self.pricing()

        results = []
        for type_id, gpu in types.items():
            # Optional exact-match filter on GPU type id.
            if gpu_type and type_id != gpu_type:
                continue

            for config in gpu.configs:
                # No regions means this count is not offered anywhere.
                if not config.regions:
                    continue
                # Early skip when the requested region isn't offered at all
                # for this config (the per-region loop below re-checks).
                if region and region not in config.regions:
                    continue

                key = f"{type_id}_x{config.gpu_count}"
                gpu_pricing = pricing.get(key)

                for r in config.regions:
                    if region and r != region:
                        continue

                    # Pricing/region metadata may be missing; rows degrade
                    # gracefully (None prices, id used as region name).
                    region_info = regions.get(r)
                    price_info = gpu_pricing.get_price(r, True) if gpu_pricing else None
                    on_demand_price = gpu_pricing.get_price(r, False) if gpu_pricing else None

                    results.append({
                        "gpu_type": type_id,
                        "gpu_name": gpu.name,
                        "gpu_count": config.gpu_count,
                        "cpu_cores": config.cpu_cores,
                        "memory_gb": config.memory_gb,
                        "storage_gb": config.storage_gb,
                        "region": r,
                        "region_name": region_info.description if region_info else r,
                        "country": region_info.country if region_info else "",
                        "price_spot": price_info,
                        "price_on_demand": on_demand_price,
                    })

        return results
|
c3/job/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Job helpers for different GPU workload types"""
|
|
2
|
+
from .base import BaseJob
|
|
3
|
+
from .comfyui import (
|
|
4
|
+
ComfyUIJob,
|
|
5
|
+
apply_params,
|
|
6
|
+
apply_graph_modes,
|
|
7
|
+
find_node,
|
|
8
|
+
find_nodes,
|
|
9
|
+
load_template,
|
|
10
|
+
graph_to_api,
|
|
11
|
+
DEFAULT_OBJECT_INFO,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BaseJob",
|
|
16
|
+
"ComfyUIJob",
|
|
17
|
+
"apply_params",
|
|
18
|
+
"apply_graph_modes",
|
|
19
|
+
"find_node",
|
|
20
|
+
"find_nodes",
|
|
21
|
+
"load_template",
|
|
22
|
+
"graph_to_api",
|
|
23
|
+
"DEFAULT_OBJECT_INFO",
|
|
24
|
+
]
|
c3/job/base.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Base job class for GPU workloads"""
|
|
2
|
+
import time
|
|
3
|
+
import httpx
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from ..client import C3
|
|
8
|
+
from ..jobs import Job
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseJob:
    """Base class for managed GPU jobs with lifecycle helpers.

    Wraps a C3 client plus one Job record and layers on: alternate
    constructors (create / get_running / get_or_create / get_by_instance),
    state polling (wait_for_running), health checks against the job's
    HTTP endpoint, and DNS-aware readiness waiting (wait_ready vs
    wait_existing). Subclasses set the DEFAULT_* / HEALTH_* class
    attributes and may override auth_headers.
    """

    # Subclass knobs: image/GPU defaults for create(), and the endpoint +
    # per-request timeout used by check_health().
    DEFAULT_IMAGE: str = ""
    DEFAULT_GPU_TYPE: str = "l40s"
    HEALTH_ENDPOINT: str = "/"
    HEALTH_TIMEOUT: float = 5.0

    def __init__(self, c3: "C3", job: "Job"):
        self.c3 = c3
        self.job = job
        # Lazily derived from hostname; reset on refresh() because the
        # hostname can change across job states.
        self._base_url: str | None = None

    @property
    def job_id(self) -> str:
        """ID of the wrapped job."""
        return self.job.job_id

    @property
    def hostname(self) -> str | None:
        """Hostname of the job's instance; None until the job is scheduled."""
        return self.job.hostname

    @property
    def base_url(self) -> str:
        """Base URL - only valid after job is running.

        Returns "" while no hostname is known, so callers can truth-test it.
        """
        if not self._base_url and self.hostname:
            self._base_url = f"http://{self.hostname}"
        return self._base_url or ""

    @property
    def auth_headers(self) -> dict:
        """Headers for authenticated requests. Override in subclasses for custom auth."""
        # NOTE(review): reaches into the client's private _api_key attribute.
        return {"Authorization": f"Bearer {self.c3._api_key}"}

    @classmethod
    def get_running(cls, c3: "C3", image_filter: str = None) -> "BaseJob | None":
        """Find an existing running job, optionally filtering by image.

        Returns the first match in API order, or None if nothing matches.
        """
        jobs = c3.jobs.list(state="running")
        for job in jobs:
            # Substring match on the docker image; assumes docker_image is
            # always a string for running jobs - TODO confirm.
            if image_filter and image_filter not in job.docker_image:
                continue
            return cls(c3, job)
        return None

    @classmethod
    def get_by_instance(cls, c3: "C3", instance: str, state: str = "running") -> "BaseJob":
        """Get a job by ID, hostname, or IP address.

        Args:
            c3: C3 client
            instance: Job ID (UUID), hostname (partial match), or IP address
            state: State filter for hostname/IP search (default: running)

        Returns:
            Job wrapper instance

        Raises:
            ValueError: If no matching job found
        """
        # Imported here (not at module top) - presumably to avoid a circular
        # import between c3.job.base and c3.jobs; verify before hoisting.
        from ..jobs import find_job

        job = find_job(c3.jobs, instance, state=state)
        if not job:
            raise ValueError(f"No job found matching: {instance}")
        return cls(c3, job)

    @classmethod
    def create(
        cls,
        c3: "C3",
        image: str = None,
        gpu_type: str = None,
        gpu_count: int = 1,
        runtime: int = 3600,
        **kwargs,
    ) -> "BaseJob":
        """Create a new job, filling image/GPU defaults from class attributes."""
        job = c3.jobs.create(
            image=image or cls.DEFAULT_IMAGE,
            gpu_type=gpu_type or cls.DEFAULT_GPU_TYPE,
            gpu_count=gpu_count,
            runtime=runtime,
            **kwargs,
        )
        return cls(c3, job)

    @classmethod
    def get_or_create(
        cls,
        c3: "C3",
        image: str = None,
        gpu_type: str = None,
        gpu_count: int = 1,
        runtime: int = 3600,
        reuse: bool = True,
        **kwargs,
    ) -> "BaseJob":
        """Get existing running job or create new one.

        With reuse=True (default) an already-running job whose image
        matches is returned instead of launching a fresh one.
        """
        if reuse:
            existing = cls.get_running(c3, image_filter=image or cls.DEFAULT_IMAGE)
            if existing:
                return existing

        return cls.create(
            c3,
            image=image,
            gpu_type=gpu_type,
            gpu_count=gpu_count,
            runtime=runtime,
            **kwargs,
        )

    def refresh(self) -> "BaseJob":
        """Refresh job state from API; returns self for chaining."""
        self.job = self.c3.jobs.get(self.job_id)
        # Hostname may have changed with the new state; drop the cached URL.
        self._base_url = None
        return self

    def wait_for_running(self, timeout: float = 300, poll_interval: float = 5) -> bool:
        """Wait for job to reach running state via API polling.

        IMPORTANT: Do not attempt to connect to hostname until this returns True.
        Connecting before the job is running will cache NXDOMAIN in DNS for ~30 mins.

        Returns:
            True when running with a hostname; False on terminal state or timeout.
        """
        start = time.time()
        while time.time() - start < timeout:
            self.refresh()
            if self.job.state == "running" and self.hostname:
                return True
            # Terminal states - no point polling further.
            if self.job.state in ("failed", "cancelled", "completed", "terminated"):
                return False
            time.sleep(poll_interval)
        return False

    def wait_for_hostname(self, timeout: float = 300, poll_interval: float = 5) -> bool:
        """Alias for wait_for_running - waits for job to be running with hostname"""
        return self.wait_for_running(timeout=timeout, poll_interval=poll_interval)

    def check_health(self) -> bool:
        """Check if the service is responding (single HTTP probe, retried).

        Only call this after job is confirmed running via wait_for_running().

        Returns:
            True only for an HTTP 200 from HEALTH_ENDPOINT; any error or
            non-200 status yields False.
        """
        if not self.base_url or self.job.state != "running":
            return False
        try:
            # Local import - presumably avoids importing http machinery at
            # module load; verify before hoisting.
            from c3.http import request_with_retry
            resp = request_with_retry(
                "get",
                f"{self.base_url}{self.HEALTH_ENDPOINT}",
                headers=self.auth_headers,
                timeout=self.HEALTH_TIMEOUT,
                retries=3,
            )
            return resp.status_code == 200
        except Exception:
            # Best-effort probe: any failure just means "not healthy yet".
            return False

    def wait_existing(self, timeout: float = 15) -> bool:
        """Wait for an EXISTING running job to respond to health checks.

        Use when reconnecting to a job that was already running (e.g., service restart,
        reusing an idle instance). No DNS delay needed because the hostname was already
        resolved previously - DNS is cached and propagated.

        Args:
            timeout: Max seconds to wait for health check (default 15s). Short because
                if an existing healthy job doesn't respond quickly, it's likely dead.

        Returns:
            True if job is running and healthy, False otherwise
        """
        self.refresh()
        if self.job.state != "running" or not self.hostname:
            return False

        start = time.time()
        while time.time() - start < timeout:
            if self.check_health():
                return True
            time.sleep(2)
        return False

    def wait_ready(
        self, timeout: float = 300, poll_interval: float = 5, dns_delay: float = 15.0
    ) -> bool:
        """Wait for a NEW job to be ready (running state + health check passing).

        For newly launched jobs only. Use wait_existing() for jobs already running.

        Flow:
            1. Poll API until job state = "running" (no DNS lookups yet)
            2. Wait dns_delay seconds for DNS to propagate
            3. Poll health endpoint until service responds

        Why dns_delay? New jobs get fresh hostnames. If we hit DNS before it propagates,
        we get NXDOMAIN which gets cached by the resolver for ~30 minutes, breaking all
        subsequent requests. The 15s delay lets DNS propagate first. ComfyUI takes >30s
        to boot anyway, so this doesn't add latency.

        Args:
            timeout: Total max seconds to wait (default 300s). Includes API polling +
                DNS delay + health checks.
            poll_interval: Seconds between API state checks (default 5s).
            dns_delay: Seconds to wait after API says "running" before first DNS lookup
                (default 15s). Prevents NXDOMAIN caching on new hostnames.
        """
        start = time.time()

        # Wait for running state via API (no DNS calls yet)
        self.refresh()
        if self.job.state in ("failed", "cancelled", "completed", "terminated"):
            return False

        if self.job.state != "running" or not self.hostname:
            if not self.wait_for_running(timeout=timeout, poll_interval=poll_interval):
                return False

        # DNS propagation delay - wait before first hostname lookup to avoid NXDOMAIN caching
        # NOTE(review): this sleep is not subtracted defensively - if polling
        # used nearly all of `timeout`, the health loop below may get no
        # iterations and this returns False without a single probe.
        if dns_delay > 0:
            time.sleep(dns_delay)

        # Job is running, check health with shorter interval
        elapsed = time.time() - start
        remaining = timeout - elapsed
        while remaining > 0:
            if self.check_health():
                return True
            time.sleep(2)  # Shorter interval for health checks
            remaining = timeout - (time.time() - start)
        return False

    def shutdown(self) -> dict:
        """Cancel the job via the API; returns the API's response payload."""
        return self.c3.jobs.cancel(self.job_id)

    def extend(self, runtime: int) -> "BaseJob":
        """Extend job runtime; updates the wrapped job and returns self."""
        self.job = self.c3.jobs.extend(self.job_id, runtime=runtime)
        return self
|