tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tetra_rp/__init__.py +109 -19
- tetra_rp/cli/commands/__init__.py +1 -0
- tetra_rp/cli/commands/apps.py +143 -0
- tetra_rp/cli/commands/build.py +1082 -0
- tetra_rp/cli/commands/build_utils/__init__.py +1 -0
- tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
- tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
- tetra_rp/cli/commands/build_utils/manifest.py +430 -0
- tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
- tetra_rp/cli/commands/build_utils/scanner.py +596 -0
- tetra_rp/cli/commands/deploy.py +580 -0
- tetra_rp/cli/commands/init.py +123 -0
- tetra_rp/cli/commands/resource.py +108 -0
- tetra_rp/cli/commands/run.py +296 -0
- tetra_rp/cli/commands/test_mothership.py +458 -0
- tetra_rp/cli/commands/undeploy.py +533 -0
- tetra_rp/cli/main.py +97 -0
- tetra_rp/cli/utils/__init__.py +1 -0
- tetra_rp/cli/utils/app.py +15 -0
- tetra_rp/cli/utils/conda.py +127 -0
- tetra_rp/cli/utils/deployment.py +530 -0
- tetra_rp/cli/utils/ignore.py +143 -0
- tetra_rp/cli/utils/skeleton.py +184 -0
- tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
- tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
- tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
- tetra_rp/cli/utils/skeleton_template/README.md +263 -0
- tetra_rp/cli/utils/skeleton_template/main.py +44 -0
- tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
- tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
- tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
- tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
- tetra_rp/client.py +136 -33
- tetra_rp/config.py +29 -0
- tetra_rp/core/api/runpod.py +591 -39
- tetra_rp/core/deployment.py +232 -0
- tetra_rp/core/discovery.py +425 -0
- tetra_rp/core/exceptions.py +50 -0
- tetra_rp/core/resources/__init__.py +27 -9
- tetra_rp/core/resources/app.py +738 -0
- tetra_rp/core/resources/base.py +139 -4
- tetra_rp/core/resources/constants.py +21 -0
- tetra_rp/core/resources/cpu.py +115 -13
- tetra_rp/core/resources/gpu.py +182 -16
- tetra_rp/core/resources/live_serverless.py +153 -16
- tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
- tetra_rp/core/resources/network_volume.py +126 -31
- tetra_rp/core/resources/resource_manager.py +436 -35
- tetra_rp/core/resources/serverless.py +537 -120
- tetra_rp/core/resources/serverless_cpu.py +201 -0
- tetra_rp/core/resources/template.py +1 -59
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/http.py +67 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/core/utils/singleton.py +36 -1
- tetra_rp/core/validation.py +44 -0
- tetra_rp/execute_class.py +301 -0
- tetra_rp/protos/remote_execution.py +98 -9
- tetra_rp/runtime/__init__.py +1 -0
- tetra_rp/runtime/circuit_breaker.py +274 -0
- tetra_rp/runtime/config.py +12 -0
- tetra_rp/runtime/exceptions.py +49 -0
- tetra_rp/runtime/generic_handler.py +206 -0
- tetra_rp/runtime/lb_handler.py +189 -0
- tetra_rp/runtime/load_balancer.py +160 -0
- tetra_rp/runtime/manifest_fetcher.py +192 -0
- tetra_rp/runtime/metrics.py +325 -0
- tetra_rp/runtime/models.py +73 -0
- tetra_rp/runtime/mothership_provisioner.py +512 -0
- tetra_rp/runtime/production_wrapper.py +266 -0
- tetra_rp/runtime/reliability_config.py +149 -0
- tetra_rp/runtime/retry_manager.py +118 -0
- tetra_rp/runtime/serialization.py +124 -0
- tetra_rp/runtime/service_registry.py +346 -0
- tetra_rp/runtime/state_manager_client.py +248 -0
- tetra_rp/stubs/live_serverless.py +35 -17
- tetra_rp/stubs/load_balancer_sls.py +357 -0
- tetra_rp/stubs/registry.py +145 -19
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
- tetra_rp-0.24.0.dist-info/RECORD +99 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
- tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
- tetra_rp/core/pool/cluster_manager.py +0 -177
- tetra_rp/core/pool/dataclass.py +0 -18
- tetra_rp/core/pool/ex.py +0 -38
- tetra_rp/core/pool/job.py +0 -22
- tetra_rp/core/pool/worker.py +0 -19
- tetra_rp/core/resources/utils.py +0 -50
- tetra_rp/core/utils/json.py +0 -33
- tetra_rp-0.6.0.dist-info/RECORD +0 -39
- /tetra_rp/{core/pool → cli}/__init__.py +0 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
tetra_rp/core/api/runpod.py
CHANGED
|
@@ -3,17 +3,53 @@ Direct GraphQL communication with Runpod API.
|
|
|
3
3
|
Bypasses the outdated runpod-python SDK limitations.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import os
|
|
7
6
|
import json
|
|
8
|
-
import aiohttp
|
|
9
|
-
from typing import Dict, Any, Optional
|
|
10
7
|
import logging
|
|
8
|
+
import os
|
|
9
|
+
from typing import Any, Dict, Optional, List
|
|
10
|
+
|
|
11
|
+
import aiohttp
|
|
12
|
+
from aiohttp.resolver import ThreadedResolver
|
|
13
|
+
|
|
14
|
+
from tetra_rp.core.exceptions import RunpodAPIKeyError
|
|
15
|
+
from tetra_rp.runtime.exceptions import GraphQLMutationError, GraphQLQueryError
|
|
11
16
|
|
|
12
17
|
log = logging.getLogger(__name__)
|
|
13
18
|
|
|
14
19
|
RUNPOD_API_BASE_URL = os.environ.get("RUNPOD_API_BASE_URL", "https://api.runpod.io")
|
|
15
20
|
RUNPOD_REST_API_URL = os.environ.get("RUNPOD_REST_API_URL", "https://rest.runpod.io/v1")
|
|
16
21
|
|
|
22
|
+
# Sensitive fields that should be redacted from logs (pre-signed URLs, tokens, etc.)
|
|
23
|
+
SENSITIVE_FIELDS = {"uploadUrl", "downloadUrl", "presignedUrl"}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _sanitize_for_logging(data: Any, redaction_text: str = "<REDACTED>") -> Any:
|
|
27
|
+
"""Recursively sanitize sensitive fields from data structures before logging.
|
|
28
|
+
|
|
29
|
+
Pre-signed URLs and other sensitive fields should not be logged as they
|
|
30
|
+
are temporary credentials that could be misused if exposed.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
data: Data structure to sanitize (dict, list, or primitive)
|
|
34
|
+
redaction_text: Text to replace sensitive values with
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
Sanitized copy of the data structure
|
|
38
|
+
"""
|
|
39
|
+
if isinstance(data, dict):
|
|
40
|
+
return {
|
|
41
|
+
key: (
|
|
42
|
+
redaction_text
|
|
43
|
+
if key in SENSITIVE_FIELDS
|
|
44
|
+
else _sanitize_for_logging(value, redaction_text)
|
|
45
|
+
)
|
|
46
|
+
for key, value in data.items()
|
|
47
|
+
}
|
|
48
|
+
elif isinstance(data, list):
|
|
49
|
+
return [_sanitize_for_logging(item, redaction_text) for item in data]
|
|
50
|
+
else:
|
|
51
|
+
return data
|
|
52
|
+
|
|
17
53
|
|
|
18
54
|
class RunpodGraphQLClient:
|
|
19
55
|
"""
|
|
@@ -26,7 +62,7 @@ class RunpodGraphQLClient:
|
|
|
26
62
|
def __init__(self, api_key: Optional[str] = None):
|
|
27
63
|
self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
|
|
28
64
|
if not self.api_key:
|
|
29
|
-
raise
|
|
65
|
+
raise RunpodAPIKeyError()
|
|
30
66
|
|
|
31
67
|
self.session: Optional[aiohttp.ClientSession] = None
|
|
32
68
|
|
|
@@ -34,12 +70,14 @@ class RunpodGraphQLClient:
|
|
|
34
70
|
"""Get or create an aiohttp session."""
|
|
35
71
|
if self.session is None or self.session.closed:
|
|
36
72
|
timeout = aiohttp.ClientTimeout(total=300) # 5 minute timeout
|
|
73
|
+
connector = aiohttp.TCPConnector(resolver=ThreadedResolver())
|
|
37
74
|
self.session = aiohttp.ClientSession(
|
|
38
75
|
timeout=timeout,
|
|
39
76
|
headers={
|
|
40
77
|
"Authorization": f"Bearer {self.api_key}",
|
|
41
78
|
"Content-Type": "application/json",
|
|
42
79
|
},
|
|
80
|
+
connector=connector,
|
|
43
81
|
)
|
|
44
82
|
return self.session
|
|
45
83
|
|
|
@@ -52,23 +90,31 @@ class RunpodGraphQLClient:
|
|
|
52
90
|
payload = {"query": query, "variables": variables or {}}
|
|
53
91
|
|
|
54
92
|
log.debug(f"GraphQL Query: {query}")
|
|
55
|
-
|
|
93
|
+
sanitized_vars = _sanitize_for_logging(variables)
|
|
94
|
+
log.debug(f"GraphQL Variables: {json.dumps(sanitized_vars, indent=2)}")
|
|
56
95
|
|
|
57
96
|
try:
|
|
58
97
|
async with session.post(self.GRAPHQL_URL, json=payload) as response:
|
|
59
98
|
response_data = await response.json()
|
|
60
99
|
|
|
61
100
|
log.debug(f"GraphQL Response Status: {response.status}")
|
|
62
|
-
|
|
101
|
+
sanitized_response = _sanitize_for_logging(response_data)
|
|
102
|
+
log.debug(
|
|
103
|
+
f"GraphQL Response: {json.dumps(sanitized_response, indent=2)}"
|
|
104
|
+
)
|
|
63
105
|
|
|
64
106
|
if response.status >= 400:
|
|
107
|
+
sanitized_err = _sanitize_for_logging(response_data)
|
|
65
108
|
raise Exception(
|
|
66
|
-
f"GraphQL request failed: {response.status} - {
|
|
109
|
+
f"GraphQL request failed: {response.status} - {sanitized_err}"
|
|
67
110
|
)
|
|
68
111
|
|
|
69
112
|
if "errors" in response_data:
|
|
70
113
|
errors = response_data["errors"]
|
|
71
|
-
|
|
114
|
+
sanitized_errors = _sanitize_for_logging(errors)
|
|
115
|
+
error_msg = "; ".join(
|
|
116
|
+
[e.get("message", str(e)) for e in sanitized_errors]
|
|
117
|
+
)
|
|
72
118
|
raise Exception(f"GraphQL errors: {error_msg}")
|
|
73
119
|
|
|
74
120
|
return response_data.get("data", {})
|
|
@@ -77,9 +123,10 @@ class RunpodGraphQLClient:
|
|
|
77
123
|
log.error(f"HTTP client error: {e}")
|
|
78
124
|
raise Exception(f"HTTP request failed: {e}")
|
|
79
125
|
|
|
80
|
-
async def
|
|
126
|
+
async def save_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
81
127
|
"""
|
|
82
|
-
Create a serverless endpoint using direct GraphQL mutation.
|
|
128
|
+
Create or update a serverless endpoint using direct GraphQL mutation.
|
|
129
|
+
When 'id' is included in the input, updates the existing endpoint.
|
|
83
130
|
Supports both GPU and CPU endpoints with full field support.
|
|
84
131
|
"""
|
|
85
132
|
# GraphQL mutation for saveEndpoint (based on actual schema)
|
|
@@ -93,6 +140,7 @@ class RunpodGraphQLClient:
|
|
|
93
140
|
locations
|
|
94
141
|
name
|
|
95
142
|
networkVolumeId
|
|
143
|
+
flashEnvironmentId
|
|
96
144
|
scalerType
|
|
97
145
|
scalerValue
|
|
98
146
|
templateId
|
|
@@ -115,9 +163,7 @@ class RunpodGraphQLClient:
|
|
|
115
163
|
|
|
116
164
|
variables = {"input": input_data}
|
|
117
165
|
|
|
118
|
-
log.debug(
|
|
119
|
-
f"Creating endpoint with GraphQL: {input_data.get('name', 'unnamed')}"
|
|
120
|
-
)
|
|
166
|
+
log.debug(f"Saving endpoint with GraphQL: {input_data.get('name', 'unnamed')}")
|
|
121
167
|
|
|
122
168
|
result = await self._execute_graphql(mutation, variables)
|
|
123
169
|
|
|
@@ -126,7 +172,7 @@ class RunpodGraphQLClient:
|
|
|
126
172
|
|
|
127
173
|
endpoint_data = result["saveEndpoint"]
|
|
128
174
|
log.info(
|
|
129
|
-
f"
|
|
175
|
+
f"Saved endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}"
|
|
130
176
|
)
|
|
131
177
|
|
|
132
178
|
return endpoint_data
|
|
@@ -199,7 +245,508 @@ class RunpodGraphQLClient:
|
|
|
199
245
|
log.info(f"Deleting endpoint: {endpoint_id}")
|
|
200
246
|
|
|
201
247
|
result = await self._execute_graphql(mutation, variables)
|
|
202
|
-
|
|
248
|
+
|
|
249
|
+
# If _execute_graphql didn't raise an exception, the deletion succeeded.
|
|
250
|
+
# The GraphQL mutation returns null on success, but presence of the key
|
|
251
|
+
# (even with null value) indicates the mutation executed.
|
|
252
|
+
# If the mutation failed, _execute_graphql would have raised an exception.
|
|
253
|
+
|
|
254
|
+
return {"success": "deleteEndpoint" in result}
|
|
255
|
+
|
|
256
|
+
async def list_flash_apps(self) -> List[Dict]:
|
|
257
|
+
"""
|
|
258
|
+
List all flash apps in Runpod.
|
|
259
|
+
"""
|
|
260
|
+
log.debug("Listing Flash apps")
|
|
261
|
+
query = """
|
|
262
|
+
query getFlashApps {
|
|
263
|
+
myself {
|
|
264
|
+
flashApps {
|
|
265
|
+
id
|
|
266
|
+
name
|
|
267
|
+
flashEnvironments {
|
|
268
|
+
id
|
|
269
|
+
name
|
|
270
|
+
state
|
|
271
|
+
createdAt
|
|
272
|
+
activeBuildId
|
|
273
|
+
}
|
|
274
|
+
flashBuilds {
|
|
275
|
+
id
|
|
276
|
+
createdAt
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
"""
|
|
282
|
+
|
|
283
|
+
result = await self._execute_graphql(query)
|
|
284
|
+
return result["myself"].get("flashApps", [])
|
|
285
|
+
|
|
286
|
+
async def prepare_artifact_upload(
|
|
287
|
+
self, input_data: Dict[str, Any]
|
|
288
|
+
) -> Dict[str, Any]:
|
|
289
|
+
mutation = """
|
|
290
|
+
mutation PrepareArtifactUpload($input: PrepareFlashArtifactUploadInput!) {
|
|
291
|
+
prepareFlashArtifactUpload(input: $input) {
|
|
292
|
+
uploadUrl
|
|
293
|
+
objectKey
|
|
294
|
+
expiresAt
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
"""
|
|
298
|
+
variables = {"input": input_data}
|
|
299
|
+
|
|
300
|
+
log.debug(f"Preparing upload url for flash environment: {input_data}")
|
|
301
|
+
|
|
302
|
+
result = await self._execute_graphql(mutation, variables)
|
|
303
|
+
return result["prepareFlashArtifactUpload"]
|
|
304
|
+
|
|
305
|
+
async def finalize_artifact_upload(
|
|
306
|
+
self, input_data: Dict[str, Any]
|
|
307
|
+
) -> Dict[str, Any]:
|
|
308
|
+
mutation = """
|
|
309
|
+
mutation FinalizeArtifactUpload($input: FinalizeFlashArtifactUploadInput!) {
|
|
310
|
+
finalizeFlashArtifactUpload(input: $input) {
|
|
311
|
+
id
|
|
312
|
+
manifest
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
"""
|
|
316
|
+
variables = {"input": input_data}
|
|
317
|
+
|
|
318
|
+
log.debug(f"finalizing upload for flash app: {input_data}")
|
|
319
|
+
|
|
320
|
+
result = await self._execute_graphql(mutation, variables)
|
|
321
|
+
return result["finalizeFlashArtifactUpload"]
|
|
322
|
+
|
|
323
|
+
async def get_flash_app(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
324
|
+
query = """
|
|
325
|
+
query getFlashApp($input: String!) {
|
|
326
|
+
flashApp(flashAppId: $input) {
|
|
327
|
+
id
|
|
328
|
+
name
|
|
329
|
+
flashEnvironments {
|
|
330
|
+
id
|
|
331
|
+
name
|
|
332
|
+
state
|
|
333
|
+
}
|
|
334
|
+
flashBuilds {
|
|
335
|
+
id
|
|
336
|
+
objectKey
|
|
337
|
+
createdAt
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
"""
|
|
342
|
+
variables = {"input": input_data}
|
|
343
|
+
|
|
344
|
+
log.debug(f"Fetching flash app for input: {input_data}")
|
|
345
|
+
result = await self._execute_graphql(query, variables)
|
|
346
|
+
return result["flashApp"]
|
|
347
|
+
|
|
348
|
+
async def get_flash_app_by_name(self, app_name: str) -> Dict[str, Any]:
|
|
349
|
+
query = """
|
|
350
|
+
query getFlashAppByName($flashAppName: String!) {
|
|
351
|
+
flashAppByName(flashAppName: $flashAppName) {
|
|
352
|
+
id
|
|
353
|
+
name
|
|
354
|
+
flashEnvironments {
|
|
355
|
+
id
|
|
356
|
+
name
|
|
357
|
+
state
|
|
358
|
+
activeBuildId
|
|
359
|
+
createdAt
|
|
360
|
+
}
|
|
361
|
+
flashBuilds {
|
|
362
|
+
id
|
|
363
|
+
objectKey
|
|
364
|
+
createdAt
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
"""
|
|
369
|
+
variables = {"flashAppName": app_name}
|
|
370
|
+
|
|
371
|
+
log.debug(f"Fetching flash app by name for input: {app_name}")
|
|
372
|
+
result = await self._execute_graphql(query, variables)
|
|
373
|
+
return result["flashAppByName"]
|
|
374
|
+
|
|
375
|
+
async def get_flash_environment(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
376
|
+
query = """
|
|
377
|
+
query getFlashEnvironment($flashEnvironmentId: String!) {
|
|
378
|
+
flashEnvironment(flashEnvironmentId: $flashEnvironmentId) {
|
|
379
|
+
id
|
|
380
|
+
name
|
|
381
|
+
state
|
|
382
|
+
activeBuildId
|
|
383
|
+
createdAt
|
|
384
|
+
endpoints {
|
|
385
|
+
id
|
|
386
|
+
name
|
|
387
|
+
}
|
|
388
|
+
networkVolumes {
|
|
389
|
+
id
|
|
390
|
+
name
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
"""
|
|
395
|
+
variables = {**input_data}
|
|
396
|
+
|
|
397
|
+
log.debug(f"Fetching flash environment for input: {variables}")
|
|
398
|
+
result = await self._execute_graphql(query, variables)
|
|
399
|
+
return result["flashEnvironment"]
|
|
400
|
+
|
|
401
|
+
async def get_flash_environment_by_name(
|
|
402
|
+
self, input_data: Dict[str, Any]
|
|
403
|
+
) -> Dict[str, Any]:
|
|
404
|
+
query = """
|
|
405
|
+
query getFlashEnvironmentByName($input: FlashEnvironmentByNameInput!) {
|
|
406
|
+
flashEnvironmentByName(input: $input) {
|
|
407
|
+
id
|
|
408
|
+
name
|
|
409
|
+
state
|
|
410
|
+
activeBuildId
|
|
411
|
+
endpoints {
|
|
412
|
+
id
|
|
413
|
+
name
|
|
414
|
+
}
|
|
415
|
+
networkVolumes {
|
|
416
|
+
id
|
|
417
|
+
name
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
"""
|
|
422
|
+
variables = {"input": input_data}
|
|
423
|
+
|
|
424
|
+
log.debug(f"Fetching flash environment by name for input: {variables}")
|
|
425
|
+
result = await self._execute_graphql(query, variables)
|
|
426
|
+
|
|
427
|
+
return result["flashEnvironmentByName"]
|
|
428
|
+
|
|
429
|
+
async def update_build_manifest(
|
|
430
|
+
self,
|
|
431
|
+
build_id: str,
|
|
432
|
+
manifest: Dict[str, Any],
|
|
433
|
+
) -> None:
|
|
434
|
+
mutation = """
|
|
435
|
+
mutation updateFlashBuildManifest($input: UpdateFlashBuildManifestInput!) {
|
|
436
|
+
updateFlashBuildManifest(input: $input) {
|
|
437
|
+
id
|
|
438
|
+
manifest
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
"""
|
|
442
|
+
variables = {"input": {"flashBuildId": build_id, "manifest": manifest}}
|
|
443
|
+
result = await self._execute_graphql(mutation, variables)
|
|
444
|
+
|
|
445
|
+
if "updateFlashBuildManifest" not in result:
|
|
446
|
+
raise GraphQLMutationError(
|
|
447
|
+
f"updateFlashBuildManifest mutation failed for build {build_id}. "
|
|
448
|
+
f"Expected 'updateFlashBuildManifest' in response, got: {list(result.keys())}"
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
async def get_flash_artifact_url(self, environment_id: str) -> Dict[str, Any]:
|
|
452
|
+
result = await self.get_flash_environment(
|
|
453
|
+
{"flashEnvironmentId": environment_id}
|
|
454
|
+
)
|
|
455
|
+
return result
|
|
456
|
+
|
|
457
|
+
async def deploy_build_to_environment(
|
|
458
|
+
self, input_data: Dict[str, Any]
|
|
459
|
+
) -> Dict[str, Any]:
|
|
460
|
+
# TODO(jhcipar) should we not generate a presigned url when promoting a build here?
|
|
461
|
+
mutation = """
|
|
462
|
+
mutation deployBuildToEnvironment($input: DeployBuildToEnvironmentInput!) {
|
|
463
|
+
deployBuildToEnvironment(input: $input) {
|
|
464
|
+
id
|
|
465
|
+
name
|
|
466
|
+
activeArtifact {
|
|
467
|
+
objectKey
|
|
468
|
+
downloadUrl
|
|
469
|
+
expiresAt
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
"""
|
|
474
|
+
|
|
475
|
+
variables = {"input": input_data}
|
|
476
|
+
|
|
477
|
+
log.debug(f"Deploying flash environment with vars: {input_data}")
|
|
478
|
+
|
|
479
|
+
result = await self._execute_graphql(mutation, variables)
|
|
480
|
+
return result["deployBuildToEnvironment"]
|
|
481
|
+
|
|
482
|
+
async def create_flash_app(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
483
|
+
"""Create a new flash app in Runpod."""
|
|
484
|
+
log.debug(f"creating flash app with name {input_data.get('name')}")
|
|
485
|
+
|
|
486
|
+
mutation = """
|
|
487
|
+
mutation createFlashApp($input: CreateFlashAppInput!) {
|
|
488
|
+
createFlashApp(input: $input) {
|
|
489
|
+
id
|
|
490
|
+
name
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
"""
|
|
494
|
+
|
|
495
|
+
variables = {"input": input_data}
|
|
496
|
+
|
|
497
|
+
log.debug(
|
|
498
|
+
f"Creating flash app with GraphQL: {input_data.get('name', 'unnamed')}"
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
result = await self._execute_graphql(mutation, variables)
|
|
502
|
+
|
|
503
|
+
return result["createFlashApp"]
|
|
504
|
+
|
|
505
|
+
async def create_flash_environment(
|
|
506
|
+
self, input_data: Dict[str, Any]
|
|
507
|
+
) -> Dict[str, Any]:
|
|
508
|
+
"""Create an environment within a flash app."""
|
|
509
|
+
log.debug(f"creating flash environment with name {input_data.get('name')}")
|
|
510
|
+
|
|
511
|
+
mutation = """
|
|
512
|
+
mutation createFlashEnvironment($input: CreateFlashEnvironmentInput!) {
|
|
513
|
+
createFlashEnvironment(input: $input) {
|
|
514
|
+
id
|
|
515
|
+
name
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
"""
|
|
519
|
+
|
|
520
|
+
variables = {"input": input_data}
|
|
521
|
+
|
|
522
|
+
log.debug(
|
|
523
|
+
f"Creating flash environment with GraphQL: {input_data.get('name', 'unnamed')}"
|
|
524
|
+
)
|
|
525
|
+
|
|
526
|
+
result = await self._execute_graphql(mutation, variables)
|
|
527
|
+
|
|
528
|
+
return result["createFlashEnvironment"]
|
|
529
|
+
|
|
530
|
+
async def register_endpoint_to_environment(
|
|
531
|
+
self, input_data: Dict[str, Any]
|
|
532
|
+
) -> Dict[str, Any]:
|
|
533
|
+
"""Register an endpoint to a Flash environment"""
|
|
534
|
+
|
|
535
|
+
log.debug(
|
|
536
|
+
f"Registering endpoint to flash environment with input data: {input_data}"
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
mutation = """
|
|
540
|
+
mutation addEndpointToFlashEnvironment($input: AddEndpointToEnvironmentInput!) {
|
|
541
|
+
addEndpointToFlashEnvironment(input: $input) {
|
|
542
|
+
id
|
|
543
|
+
name
|
|
544
|
+
flashEnvironmentId
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
"""
|
|
548
|
+
|
|
549
|
+
variables = {"input": input_data}
|
|
550
|
+
|
|
551
|
+
result = await self._execute_graphql(mutation, variables)
|
|
552
|
+
|
|
553
|
+
return result["addEndpointToFlashEnvironment"]
|
|
554
|
+
|
|
555
|
+
async def register_network_volume_to_environment(
|
|
556
|
+
self, input_data: Dict[str, Any]
|
|
557
|
+
) -> Dict[str, Any]:
|
|
558
|
+
"""Register an endpoint to a Flash environment"""
|
|
559
|
+
|
|
560
|
+
log.debug(
|
|
561
|
+
f"Registering endpoint to flash environment with input data: {input_data}"
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
mutation = """
|
|
565
|
+
mutation addNetworkVolumeToFlashEnvironment($input: AddNetworkVolumeToEnvironmentInput!) {
|
|
566
|
+
addNetworkVolumeToFlashEnvironment(input: $input) {
|
|
567
|
+
id
|
|
568
|
+
name
|
|
569
|
+
flashEnvironmentId
|
|
570
|
+
}
|
|
571
|
+
}
|
|
572
|
+
"""
|
|
573
|
+
|
|
574
|
+
variables = {"input": input_data}
|
|
575
|
+
|
|
576
|
+
result = await self._execute_graphql(mutation, variables)
|
|
577
|
+
|
|
578
|
+
return result["addNetworkVolumeToFlashEnvironment"]
|
|
579
|
+
|
|
580
|
+
async def set_environment_state(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
581
|
+
log.debug(f"Setting Flash environment status with input data: {input_data}")
|
|
582
|
+
|
|
583
|
+
mutation = """
|
|
584
|
+
mutation updateFlashEnvironment($input: UpdateFlashEnvironmentInput!) {
|
|
585
|
+
updateFlashEnvironment(input: $input) {
|
|
586
|
+
id
|
|
587
|
+
name
|
|
588
|
+
state
|
|
589
|
+
}
|
|
590
|
+
}
|
|
591
|
+
"""
|
|
592
|
+
|
|
593
|
+
variables = {"input": input_data}
|
|
594
|
+
|
|
595
|
+
result = await self._execute_graphql(mutation, variables)
|
|
596
|
+
|
|
597
|
+
return result["updateFlashEnvironment"]
|
|
598
|
+
|
|
599
|
+
async def get_flash_build(self, build_id: str) -> Dict[str, Any]:
|
|
600
|
+
"""Fetch flash build by ID.
|
|
601
|
+
|
|
602
|
+
Args:
|
|
603
|
+
build_id: Build ID string (UUID format).
|
|
604
|
+
|
|
605
|
+
Returns:
|
|
606
|
+
Build data including id and manifest.
|
|
607
|
+
|
|
608
|
+
Raises:
|
|
609
|
+
TypeError: If build_id is not a string.
|
|
610
|
+
GraphQLQueryError: If build not found or query fails.
|
|
611
|
+
|
|
612
|
+
Note:
|
|
613
|
+
API changed in PR #144:
|
|
614
|
+
- Previously accepted Dict[str, Any], now requires string build_id directly
|
|
615
|
+
- Query now requests 'manifest' field instead of 'name' field
|
|
616
|
+
"""
|
|
617
|
+
if not isinstance(build_id, str):
|
|
618
|
+
raise TypeError(
|
|
619
|
+
f"get_flash_build() expects build_id as str, got {type(build_id).__name__}. "
|
|
620
|
+
f"API changed in PR #144 - update caller to pass build_id string directly."
|
|
621
|
+
)
|
|
622
|
+
|
|
623
|
+
query = """
|
|
624
|
+
query getFlashBuild($input: String!) {
|
|
625
|
+
flashBuild(flashBuildId: $input) {
|
|
626
|
+
id
|
|
627
|
+
manifest
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
"""
|
|
631
|
+
variables = {"input": build_id}
|
|
632
|
+
|
|
633
|
+
log.debug(f"Fetching flash build for input: {build_id}")
|
|
634
|
+
result = await self._execute_graphql(query, variables)
|
|
635
|
+
|
|
636
|
+
if "flashBuild" not in result:
|
|
637
|
+
raise GraphQLQueryError(
|
|
638
|
+
f"get_flash_build query failed for build {build_id}. "
|
|
639
|
+
f"Expected 'flashBuild' in response, got: {list(result.keys())}"
|
|
640
|
+
)
|
|
641
|
+
|
|
642
|
+
return result["flashBuild"]
|
|
643
|
+
|
|
644
|
+
async def list_flash_builds_by_app_id(self, app_id: str) -> List[Dict[str, Any]]:
|
|
645
|
+
"""List all builds for a flash app by app ID (optimized query).
|
|
646
|
+
|
|
647
|
+
Args:
|
|
648
|
+
app_id: The flash app ID
|
|
649
|
+
|
|
650
|
+
Returns:
|
|
651
|
+
List of build dictionaries with id, objectKey, createdAt fields
|
|
652
|
+
"""
|
|
653
|
+
query = """
|
|
654
|
+
query listFlashBuilds($flashAppId: String!) {
|
|
655
|
+
flashApp(flashAppId: $flashAppId) {
|
|
656
|
+
flashBuilds {
|
|
657
|
+
id
|
|
658
|
+
objectKey
|
|
659
|
+
createdAt
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
"""
|
|
664
|
+
variables = {"flashAppId": app_id}
|
|
665
|
+
|
|
666
|
+
log.debug(f"Listing flash builds for app: {app_id}")
|
|
667
|
+
result = await self._execute_graphql(query, variables)
|
|
668
|
+
return result["flashApp"]["flashBuilds"]
|
|
669
|
+
|
|
670
|
+
async def list_flash_environments_by_app_id(
|
|
671
|
+
self, app_id: str
|
|
672
|
+
) -> List[Dict[str, Any]]:
|
|
673
|
+
"""List all environments for a flash app by app ID (optimized query).
|
|
674
|
+
|
|
675
|
+
Args:
|
|
676
|
+
app_id: The flash app ID
|
|
677
|
+
|
|
678
|
+
Returns:
|
|
679
|
+
List of environment dictionaries with id, name, state, activeBuildId, createdAt fields
|
|
680
|
+
"""
|
|
681
|
+
query = """
|
|
682
|
+
query listFlashEnvironments($flashAppId: String!) {
|
|
683
|
+
flashApp(flashAppId: $flashAppId) {
|
|
684
|
+
flashEnvironments {
|
|
685
|
+
id
|
|
686
|
+
name
|
|
687
|
+
state
|
|
688
|
+
activeBuildId
|
|
689
|
+
createdAt
|
|
690
|
+
}
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
"""
|
|
694
|
+
variables = {"flashAppId": app_id}
|
|
695
|
+
|
|
696
|
+
log.debug(f"Listing flash environments for app: {app_id}")
|
|
697
|
+
result = await self._execute_graphql(query, variables)
|
|
698
|
+
return result["flashApp"]["flashEnvironments"]
|
|
699
|
+
|
|
700
|
+
async def delete_flash_app(self, app_id: str) -> Dict[str, Any]:
|
|
701
|
+
mutation = """
|
|
702
|
+
mutation deleteFlashApp($flashAppId: String!) {
|
|
703
|
+
deleteFlashApp(flashAppId: $flashAppId)
|
|
704
|
+
}
|
|
705
|
+
"""
|
|
706
|
+
|
|
707
|
+
variables = {"flashAppId": app_id}
|
|
708
|
+
log.info(f"Deleting flash app: {app_id}")
|
|
709
|
+
|
|
710
|
+
result = await self._execute_graphql(mutation, variables)
|
|
711
|
+
return {"success": "deleteFlashApp" in result}
|
|
712
|
+
|
|
713
|
+
async def delete_flash_environment(self, environment_id: str) -> Dict[str, Any]:
|
|
714
|
+
"""Delete a flash environment."""
|
|
715
|
+
mutation = """
|
|
716
|
+
mutation deleteFlashEnvironment($flashEnvironmentId: String!) {
|
|
717
|
+
deleteFlashEnvironment(flashEnvironmentId: $flashEnvironmentId)
|
|
718
|
+
}
|
|
719
|
+
"""
|
|
720
|
+
|
|
721
|
+
variables = {"flashEnvironmentId": environment_id}
|
|
722
|
+
log.info(f"Deleting flash environment: {environment_id}")
|
|
723
|
+
|
|
724
|
+
result = await self._execute_graphql(mutation, variables)
|
|
725
|
+
return {"success": "deleteFlashEnvironment" in result}
|
|
726
|
+
|
|
727
|
+
async def endpoint_exists(self, endpoint_id: str) -> bool:
|
|
728
|
+
"""Check if an endpoint exists by querying the user's endpoint list."""
|
|
729
|
+
query = """
|
|
730
|
+
query {
|
|
731
|
+
myself {
|
|
732
|
+
endpoints {
|
|
733
|
+
id
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
"""
|
|
738
|
+
|
|
739
|
+
try:
|
|
740
|
+
result = await self._execute_graphql(query)
|
|
741
|
+
endpoints = result.get("myself", {}).get("endpoints", [])
|
|
742
|
+
endpoint_ids = [ep.get("id") for ep in endpoints]
|
|
743
|
+
exists = endpoint_id in endpoint_ids
|
|
744
|
+
|
|
745
|
+
log.debug(f"Endpoint {endpoint_id} exists: {exists}")
|
|
746
|
+
return exists
|
|
747
|
+
except Exception as e:
|
|
748
|
+
log.error(f"Error checking endpoint existence: {e}")
|
|
749
|
+
return False
|
|
203
750
|
|
|
204
751
|
async def close(self):
|
|
205
752
|
"""Close the HTTP session."""
|
|
@@ -222,7 +769,7 @@ class RunpodRestClient:
|
|
|
222
769
|
def __init__(self, api_key: Optional[str] = None):
|
|
223
770
|
self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
|
|
224
771
|
if not self.api_key:
|
|
225
|
-
raise
|
|
772
|
+
raise RunpodAPIKeyError()
|
|
226
773
|
|
|
227
774
|
self.session: Optional[aiohttp.ClientSession] = None
|
|
228
775
|
|
|
@@ -267,36 +814,41 @@ class RunpodRestClient:
|
|
|
267
814
|
raise Exception(f"HTTP request failed: {e}")
|
|
268
815
|
|
|
269
816
|
async def create_network_volume(self, payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
270
|
-
"""
|
|
271
|
-
|
|
817
|
+
"""Create a network volume in Runpod."""
|
|
818
|
+
log.debug(f"Creating network volume: {payload.get('name', 'unnamed')}")
|
|
272
819
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
size_gb (int): The size of the volume in GB.
|
|
820
|
+
result = await self._execute_rest(
|
|
821
|
+
"POST", f"{RUNPOD_REST_API_URL}/networkvolumes", payload
|
|
822
|
+
)
|
|
277
823
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
# If datacenter_id is an enum, get its value
|
|
284
|
-
datacenter_id = datacenter_id.value
|
|
285
|
-
data = {
|
|
286
|
-
"dataCenterId": datacenter_id,
|
|
287
|
-
"name": payload.get("name"),
|
|
288
|
-
"size": payload.get("size"),
|
|
289
|
-
}
|
|
290
|
-
url = f"{RUNPOD_REST_API_URL}/networkvolumes"
|
|
824
|
+
log.info(
|
|
825
|
+
f"Created network volume: {result.get('id', 'unknown')} - {result.get('name', 'unnamed')}"
|
|
826
|
+
)
|
|
827
|
+
|
|
828
|
+
return result
|
|
291
829
|
|
|
292
|
-
|
|
830
|
+
async def list_network_volumes(self) -> Dict[str, Any]:
|
|
831
|
+
"""
|
|
832
|
+
List all network volumes in Runpod.
|
|
293
833
|
|
|
294
|
-
|
|
834
|
+
Returns:
|
|
835
|
+
List of network volume objects or dict containing networkVolumes key.
|
|
836
|
+
The API may return either format depending on version.
|
|
837
|
+
"""
|
|
838
|
+
log.debug("Listing network volumes")
|
|
295
839
|
|
|
296
|
-
|
|
297
|
-
|
|
840
|
+
result = await self._execute_rest(
|
|
841
|
+
"GET", f"{RUNPOD_REST_API_URL}/networkvolumes"
|
|
298
842
|
)
|
|
299
843
|
|
|
844
|
+
# Handle both list and dict responses
|
|
845
|
+
if isinstance(result, list):
|
|
846
|
+
volume_count = len(result)
|
|
847
|
+
else:
|
|
848
|
+
volume_count = len(result.get("networkVolumes", []))
|
|
849
|
+
|
|
850
|
+
log.debug(f"Listed {volume_count} network volumes")
|
|
851
|
+
|
|
300
852
|
return result
|
|
301
853
|
|
|
302
854
|
async def close(self):
|