tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. tetra_rp/__init__.py +109 -19
  2. tetra_rp/cli/commands/__init__.py +1 -0
  3. tetra_rp/cli/commands/apps.py +143 -0
  4. tetra_rp/cli/commands/build.py +1082 -0
  5. tetra_rp/cli/commands/build_utils/__init__.py +1 -0
  6. tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
  7. tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
  8. tetra_rp/cli/commands/build_utils/manifest.py +430 -0
  9. tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
  10. tetra_rp/cli/commands/build_utils/scanner.py +596 -0
  11. tetra_rp/cli/commands/deploy.py +580 -0
  12. tetra_rp/cli/commands/init.py +123 -0
  13. tetra_rp/cli/commands/resource.py +108 -0
  14. tetra_rp/cli/commands/run.py +296 -0
  15. tetra_rp/cli/commands/test_mothership.py +458 -0
  16. tetra_rp/cli/commands/undeploy.py +533 -0
  17. tetra_rp/cli/main.py +97 -0
  18. tetra_rp/cli/utils/__init__.py +1 -0
  19. tetra_rp/cli/utils/app.py +15 -0
  20. tetra_rp/cli/utils/conda.py +127 -0
  21. tetra_rp/cli/utils/deployment.py +530 -0
  22. tetra_rp/cli/utils/ignore.py +143 -0
  23. tetra_rp/cli/utils/skeleton.py +184 -0
  24. tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
  25. tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
  26. tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
  27. tetra_rp/cli/utils/skeleton_template/README.md +263 -0
  28. tetra_rp/cli/utils/skeleton_template/main.py +44 -0
  29. tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
  30. tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
  31. tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
  32. tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
  33. tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
  34. tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
  35. tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
  36. tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
  37. tetra_rp/client.py +136 -33
  38. tetra_rp/config.py +29 -0
  39. tetra_rp/core/api/runpod.py +591 -39
  40. tetra_rp/core/deployment.py +232 -0
  41. tetra_rp/core/discovery.py +425 -0
  42. tetra_rp/core/exceptions.py +50 -0
  43. tetra_rp/core/resources/__init__.py +27 -9
  44. tetra_rp/core/resources/app.py +738 -0
  45. tetra_rp/core/resources/base.py +139 -4
  46. tetra_rp/core/resources/constants.py +21 -0
  47. tetra_rp/core/resources/cpu.py +115 -13
  48. tetra_rp/core/resources/gpu.py +182 -16
  49. tetra_rp/core/resources/live_serverless.py +153 -16
  50. tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
  51. tetra_rp/core/resources/network_volume.py +126 -31
  52. tetra_rp/core/resources/resource_manager.py +436 -35
  53. tetra_rp/core/resources/serverless.py +537 -120
  54. tetra_rp/core/resources/serverless_cpu.py +201 -0
  55. tetra_rp/core/resources/template.py +1 -59
  56. tetra_rp/core/utils/constants.py +10 -0
  57. tetra_rp/core/utils/file_lock.py +260 -0
  58. tetra_rp/core/utils/http.py +67 -0
  59. tetra_rp/core/utils/lru_cache.py +75 -0
  60. tetra_rp/core/utils/singleton.py +36 -1
  61. tetra_rp/core/validation.py +44 -0
  62. tetra_rp/execute_class.py +301 -0
  63. tetra_rp/protos/remote_execution.py +98 -9
  64. tetra_rp/runtime/__init__.py +1 -0
  65. tetra_rp/runtime/circuit_breaker.py +274 -0
  66. tetra_rp/runtime/config.py +12 -0
  67. tetra_rp/runtime/exceptions.py +49 -0
  68. tetra_rp/runtime/generic_handler.py +206 -0
  69. tetra_rp/runtime/lb_handler.py +189 -0
  70. tetra_rp/runtime/load_balancer.py +160 -0
  71. tetra_rp/runtime/manifest_fetcher.py +192 -0
  72. tetra_rp/runtime/metrics.py +325 -0
  73. tetra_rp/runtime/models.py +73 -0
  74. tetra_rp/runtime/mothership_provisioner.py +512 -0
  75. tetra_rp/runtime/production_wrapper.py +266 -0
  76. tetra_rp/runtime/reliability_config.py +149 -0
  77. tetra_rp/runtime/retry_manager.py +118 -0
  78. tetra_rp/runtime/serialization.py +124 -0
  79. tetra_rp/runtime/service_registry.py +346 -0
  80. tetra_rp/runtime/state_manager_client.py +248 -0
  81. tetra_rp/stubs/live_serverless.py +35 -17
  82. tetra_rp/stubs/load_balancer_sls.py +357 -0
  83. tetra_rp/stubs/registry.py +145 -19
  84. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
  85. tetra_rp-0.24.0.dist-info/RECORD +99 -0
  86. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
  87. tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
  88. tetra_rp/core/pool/cluster_manager.py +0 -177
  89. tetra_rp/core/pool/dataclass.py +0 -18
  90. tetra_rp/core/pool/ex.py +0 -38
  91. tetra_rp/core/pool/job.py +0 -22
  92. tetra_rp/core/pool/worker.py +0 -19
  93. tetra_rp/core/resources/utils.py +0 -50
  94. tetra_rp/core/utils/json.py +0 -33
  95. tetra_rp-0.6.0.dist-info/RECORD +0 -39
  96. /tetra_rp/{core/pool → cli}/__init__.py +0 -0
  97. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
@@ -3,17 +3,53 @@ Direct GraphQL communication with Runpod API.
3
3
  Bypasses the outdated runpod-python SDK limitations.
4
4
  """
5
5
 
6
- import os
7
6
  import json
8
- import aiohttp
9
- from typing import Dict, Any, Optional
10
7
  import logging
8
+ import os
9
+ from typing import Any, Dict, Optional, List
10
+
11
+ import aiohttp
12
+ from aiohttp.resolver import ThreadedResolver
13
+
14
+ from tetra_rp.core.exceptions import RunpodAPIKeyError
15
+ from tetra_rp.runtime.exceptions import GraphQLMutationError, GraphQLQueryError
11
16
 
12
17
  log = logging.getLogger(__name__)
13
18
 
14
19
  RUNPOD_API_BASE_URL = os.environ.get("RUNPOD_API_BASE_URL", "https://api.runpod.io")
15
20
  RUNPOD_REST_API_URL = os.environ.get("RUNPOD_REST_API_URL", "https://rest.runpod.io/v1")
16
21
 
22
+ # Sensitive fields that should be redacted from logs (pre-signed URLs, tokens, etc.)
23
+ SENSITIVE_FIELDS = {"uploadUrl", "downloadUrl", "presignedUrl"}
24
+
25
+
26
+ def _sanitize_for_logging(data: Any, redaction_text: str = "<REDACTED>") -> Any:
27
+ """Recursively sanitize sensitive fields from data structures before logging.
28
+
29
+ Pre-signed URLs and other sensitive fields should not be logged as they
30
+ are temporary credentials that could be misused if exposed.
31
+
32
+ Args:
33
+ data: Data structure to sanitize (dict, list, or primitive)
34
+ redaction_text: Text to replace sensitive values with
35
+
36
+ Returns:
37
+ Sanitized copy of the data structure
38
+ """
39
+ if isinstance(data, dict):
40
+ return {
41
+ key: (
42
+ redaction_text
43
+ if key in SENSITIVE_FIELDS
44
+ else _sanitize_for_logging(value, redaction_text)
45
+ )
46
+ for key, value in data.items()
47
+ }
48
+ elif isinstance(data, list):
49
+ return [_sanitize_for_logging(item, redaction_text) for item in data]
50
+ else:
51
+ return data
52
+
17
53
 
18
54
  class RunpodGraphQLClient:
19
55
  """
@@ -26,7 +62,7 @@ class RunpodGraphQLClient:
26
62
  def __init__(self, api_key: Optional[str] = None):
27
63
  self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
28
64
  if not self.api_key:
29
- raise ValueError("Runpod API key is required")
65
+ raise RunpodAPIKeyError()
30
66
 
31
67
  self.session: Optional[aiohttp.ClientSession] = None
32
68
 
@@ -34,12 +70,14 @@ class RunpodGraphQLClient:
34
70
  """Get or create an aiohttp session."""
35
71
  if self.session is None or self.session.closed:
36
72
  timeout = aiohttp.ClientTimeout(total=300) # 5 minute timeout
73
+ connector = aiohttp.TCPConnector(resolver=ThreadedResolver())
37
74
  self.session = aiohttp.ClientSession(
38
75
  timeout=timeout,
39
76
  headers={
40
77
  "Authorization": f"Bearer {self.api_key}",
41
78
  "Content-Type": "application/json",
42
79
  },
80
+ connector=connector,
43
81
  )
44
82
  return self.session
45
83
 
@@ -52,23 +90,31 @@ class RunpodGraphQLClient:
52
90
  payload = {"query": query, "variables": variables or {}}
53
91
 
54
92
  log.debug(f"GraphQL Query: {query}")
55
- log.debug(f"GraphQL Variables: {json.dumps(variables, indent=2)}")
93
+ sanitized_vars = _sanitize_for_logging(variables)
94
+ log.debug(f"GraphQL Variables: {json.dumps(sanitized_vars, indent=2)}")
56
95
 
57
96
  try:
58
97
  async with session.post(self.GRAPHQL_URL, json=payload) as response:
59
98
  response_data = await response.json()
60
99
 
61
100
  log.debug(f"GraphQL Response Status: {response.status}")
62
- log.debug(f"GraphQL Response: {json.dumps(response_data, indent=2)}")
101
+ sanitized_response = _sanitize_for_logging(response_data)
102
+ log.debug(
103
+ f"GraphQL Response: {json.dumps(sanitized_response, indent=2)}"
104
+ )
63
105
 
64
106
  if response.status >= 400:
107
+ sanitized_err = _sanitize_for_logging(response_data)
65
108
  raise Exception(
66
- f"GraphQL request failed: {response.status} - {response_data}"
109
+ f"GraphQL request failed: {response.status} - {sanitized_err}"
67
110
  )
68
111
 
69
112
  if "errors" in response_data:
70
113
  errors = response_data["errors"]
71
- error_msg = "; ".join([e.get("message", str(e)) for e in errors])
114
+ sanitized_errors = _sanitize_for_logging(errors)
115
+ error_msg = "; ".join(
116
+ [e.get("message", str(e)) for e in sanitized_errors]
117
+ )
72
118
  raise Exception(f"GraphQL errors: {error_msg}")
73
119
 
74
120
  return response_data.get("data", {})
@@ -77,9 +123,10 @@ class RunpodGraphQLClient:
77
123
  log.error(f"HTTP client error: {e}")
78
124
  raise Exception(f"HTTP request failed: {e}")
79
125
 
80
- async def create_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
126
+ async def save_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
81
127
  """
82
- Create a serverless endpoint using direct GraphQL mutation.
128
+ Create or update a serverless endpoint using direct GraphQL mutation.
129
+ When 'id' is included in the input, updates the existing endpoint.
83
130
  Supports both GPU and CPU endpoints with full field support.
84
131
  """
85
132
  # GraphQL mutation for saveEndpoint (based on actual schema)
@@ -93,6 +140,7 @@ class RunpodGraphQLClient:
93
140
  locations
94
141
  name
95
142
  networkVolumeId
143
+ flashEnvironmentId
96
144
  scalerType
97
145
  scalerValue
98
146
  templateId
@@ -115,9 +163,7 @@ class RunpodGraphQLClient:
115
163
 
116
164
  variables = {"input": input_data}
117
165
 
118
- log.debug(
119
- f"Creating endpoint with GraphQL: {input_data.get('name', 'unnamed')}"
120
- )
166
+ log.debug(f"Saving endpoint with GraphQL: {input_data.get('name', 'unnamed')}")
121
167
 
122
168
  result = await self._execute_graphql(mutation, variables)
123
169
 
@@ -126,7 +172,7 @@ class RunpodGraphQLClient:
126
172
 
127
173
  endpoint_data = result["saveEndpoint"]
128
174
  log.info(
129
- f"Created endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}"
175
+ f"Saved endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}"
130
176
  )
131
177
 
132
178
  return endpoint_data
@@ -199,7 +245,508 @@ class RunpodGraphQLClient:
199
245
  log.info(f"Deleting endpoint: {endpoint_id}")
200
246
 
201
247
  result = await self._execute_graphql(mutation, variables)
202
- return {"success": result.get("deleteEndpoint") is not None}
248
+
249
+ # If _execute_graphql didn't raise an exception, the deletion succeeded.
250
+ # The GraphQL mutation returns null on success, but presence of the key
251
+ # (even with null value) indicates the mutation executed.
252
+ # If the mutation failed, _execute_graphql would have raised an exception.
253
+
254
+ return {"success": "deleteEndpoint" in result}
255
+
256
+ async def list_flash_apps(self) -> List[Dict]:
257
+ """
258
+ List all flash apps in Runpod.
259
+ """
260
+ log.debug("Listing Flash apps")
261
+ query = """
262
+ query getFlashApps {
263
+ myself {
264
+ flashApps {
265
+ id
266
+ name
267
+ flashEnvironments {
268
+ id
269
+ name
270
+ state
271
+ createdAt
272
+ activeBuildId
273
+ }
274
+ flashBuilds {
275
+ id
276
+ createdAt
277
+ }
278
+ }
279
+ }
280
+ }
281
+ """
282
+
283
+ result = await self._execute_graphql(query)
284
+ return result["myself"].get("flashApps", [])
285
+
286
+ async def prepare_artifact_upload(
287
+ self, input_data: Dict[str, Any]
288
+ ) -> Dict[str, Any]:
289
+ mutation = """
290
+ mutation PrepareArtifactUpload($input: PrepareFlashArtifactUploadInput!) {
291
+ prepareFlashArtifactUpload(input: $input) {
292
+ uploadUrl
293
+ objectKey
294
+ expiresAt
295
+ }
296
+ }
297
+ """
298
+ variables = {"input": input_data}
299
+
300
+ log.debug(f"Preparing upload url for flash environment: {input_data}")
301
+
302
+ result = await self._execute_graphql(mutation, variables)
303
+ return result["prepareFlashArtifactUpload"]
304
+
305
+ async def finalize_artifact_upload(
306
+ self, input_data: Dict[str, Any]
307
+ ) -> Dict[str, Any]:
308
+ mutation = """
309
+ mutation FinalizeArtifactUpload($input: FinalizeFlashArtifactUploadInput!) {
310
+ finalizeFlashArtifactUpload(input: $input) {
311
+ id
312
+ manifest
313
+ }
314
+ }
315
+ """
316
+ variables = {"input": input_data}
317
+
318
+ log.debug(f"finalizing upload for flash app: {input_data}")
319
+
320
+ result = await self._execute_graphql(mutation, variables)
321
+ return result["finalizeFlashArtifactUpload"]
322
+
323
+ async def get_flash_app(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
324
+ query = """
325
+ query getFlashApp($input: String!) {
326
+ flashApp(flashAppId: $input) {
327
+ id
328
+ name
329
+ flashEnvironments {
330
+ id
331
+ name
332
+ state
333
+ }
334
+ flashBuilds {
335
+ id
336
+ objectKey
337
+ createdAt
338
+ }
339
+ }
340
+ }
341
+ """
342
+ variables = {"input": input_data}
343
+
344
+ log.debug(f"Fetching flash app for input: {input_data}")
345
+ result = await self._execute_graphql(query, variables)
346
+ return result["flashApp"]
347
+
348
+ async def get_flash_app_by_name(self, app_name: str) -> Dict[str, Any]:
349
+ query = """
350
+ query getFlashAppByName($flashAppName: String!) {
351
+ flashAppByName(flashAppName: $flashAppName) {
352
+ id
353
+ name
354
+ flashEnvironments {
355
+ id
356
+ name
357
+ state
358
+ activeBuildId
359
+ createdAt
360
+ }
361
+ flashBuilds {
362
+ id
363
+ objectKey
364
+ createdAt
365
+ }
366
+ }
367
+ }
368
+ """
369
+ variables = {"flashAppName": app_name}
370
+
371
+ log.debug(f"Fetching flash app by name for input: {app_name}")
372
+ result = await self._execute_graphql(query, variables)
373
+ return result["flashAppByName"]
374
+
375
+ async def get_flash_environment(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
376
+ query = """
377
+ query getFlashEnvironment($flashEnvironmentId: String!) {
378
+ flashEnvironment(flashEnvironmentId: $flashEnvironmentId) {
379
+ id
380
+ name
381
+ state
382
+ activeBuildId
383
+ createdAt
384
+ endpoints {
385
+ id
386
+ name
387
+ }
388
+ networkVolumes {
389
+ id
390
+ name
391
+ }
392
+ }
393
+ }
394
+ """
395
+ variables = {**input_data}
396
+
397
+ log.debug(f"Fetching flash environment for input: {variables}")
398
+ result = await self._execute_graphql(query, variables)
399
+ return result["flashEnvironment"]
400
+
401
+ async def get_flash_environment_by_name(
402
+ self, input_data: Dict[str, Any]
403
+ ) -> Dict[str, Any]:
404
+ query = """
405
+ query getFlashEnvironmentByName($input: FlashEnvironmentByNameInput!) {
406
+ flashEnvironmentByName(input: $input) {
407
+ id
408
+ name
409
+ state
410
+ activeBuildId
411
+ endpoints {
412
+ id
413
+ name
414
+ }
415
+ networkVolumes {
416
+ id
417
+ name
418
+ }
419
+ }
420
+ }
421
+ """
422
+ variables = {"input": input_data}
423
+
424
+ log.debug(f"Fetching flash environment by name for input: {variables}")
425
+ result = await self._execute_graphql(query, variables)
426
+
427
+ return result["flashEnvironmentByName"]
428
+
429
+ async def update_build_manifest(
430
+ self,
431
+ build_id: str,
432
+ manifest: Dict[str, Any],
433
+ ) -> None:
434
+ mutation = """
435
+ mutation updateFlashBuildManifest($input: UpdateFlashBuildManifestInput!) {
436
+ updateFlashBuildManifest(input: $input) {
437
+ id
438
+ manifest
439
+ }
440
+ }
441
+ """
442
+ variables = {"input": {"flashBuildId": build_id, "manifest": manifest}}
443
+ result = await self._execute_graphql(mutation, variables)
444
+
445
+ if "updateFlashBuildManifest" not in result:
446
+ raise GraphQLMutationError(
447
+ f"updateFlashBuildManifest mutation failed for build {build_id}. "
448
+ f"Expected 'updateFlashBuildManifest' in response, got: {list(result.keys())}"
449
+ )
450
+
451
+ async def get_flash_artifact_url(self, environment_id: str) -> Dict[str, Any]:
452
+ result = await self.get_flash_environment(
453
+ {"flashEnvironmentId": environment_id}
454
+ )
455
+ return result
456
+
457
+ async def deploy_build_to_environment(
458
+ self, input_data: Dict[str, Any]
459
+ ) -> Dict[str, Any]:
460
+ # TODO(jhcipar) should we not generate a presigned url when promoting a build here?
461
+ mutation = """
462
+ mutation deployBuildToEnvironment($input: DeployBuildToEnvironmentInput!) {
463
+ deployBuildToEnvironment(input: $input) {
464
+ id
465
+ name
466
+ activeArtifact {
467
+ objectKey
468
+ downloadUrl
469
+ expiresAt
470
+ }
471
+ }
472
+ }
473
+ """
474
+
475
+ variables = {"input": input_data}
476
+
477
+ log.debug(f"Deploying flash environment with vars: {input_data}")
478
+
479
+ result = await self._execute_graphql(mutation, variables)
480
+ return result["deployBuildToEnvironment"]
481
+
482
+ async def create_flash_app(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
483
+ """Create a new flash app in Runpod."""
484
+ log.debug(f"creating flash app with name {input_data.get('name')}")
485
+
486
+ mutation = """
487
+ mutation createFlashApp($input: CreateFlashAppInput!) {
488
+ createFlashApp(input: $input) {
489
+ id
490
+ name
491
+ }
492
+ }
493
+ """
494
+
495
+ variables = {"input": input_data}
496
+
497
+ log.debug(
498
+ f"Creating flash app with GraphQL: {input_data.get('name', 'unnamed')}"
499
+ )
500
+
501
+ result = await self._execute_graphql(mutation, variables)
502
+
503
+ return result["createFlashApp"]
504
+
505
+ async def create_flash_environment(
506
+ self, input_data: Dict[str, Any]
507
+ ) -> Dict[str, Any]:
508
+ """Create an environment within a flash app."""
509
+ log.debug(f"creating flash environment with name {input_data.get('name')}")
510
+
511
+ mutation = """
512
+ mutation createFlashEnvironment($input: CreateFlashEnvironmentInput!) {
513
+ createFlashEnvironment(input: $input) {
514
+ id
515
+ name
516
+ }
517
+ }
518
+ """
519
+
520
+ variables = {"input": input_data}
521
+
522
+ log.debug(
523
+ f"Creating flash environment with GraphQL: {input_data.get('name', 'unnamed')}"
524
+ )
525
+
526
+ result = await self._execute_graphql(mutation, variables)
527
+
528
+ return result["createFlashEnvironment"]
529
+
530
+ async def register_endpoint_to_environment(
531
+ self, input_data: Dict[str, Any]
532
+ ) -> Dict[str, Any]:
533
+ """Register an endpoint to a Flash environment"""
534
+
535
+ log.debug(
536
+ f"Registering endpoint to flash environment with input data: {input_data}"
537
+ )
538
+
539
+ mutation = """
540
+ mutation addEndpointToFlashEnvironment($input: AddEndpointToEnvironmentInput!) {
541
+ addEndpointToFlashEnvironment(input: $input) {
542
+ id
543
+ name
544
+ flashEnvironmentId
545
+ }
546
+ }
547
+ """
548
+
549
+ variables = {"input": input_data}
550
+
551
+ result = await self._execute_graphql(mutation, variables)
552
+
553
+ return result["addEndpointToFlashEnvironment"]
554
+
555
+ async def register_network_volume_to_environment(
556
+ self, input_data: Dict[str, Any]
557
+ ) -> Dict[str, Any]:
558
+ """Register an endpoint to a Flash environment"""
559
+
560
+ log.debug(
561
+ f"Registering endpoint to flash environment with input data: {input_data}"
562
+ )
563
+
564
+ mutation = """
565
+ mutation addNetworkVolumeToFlashEnvironment($input: AddNetworkVolumeToEnvironmentInput!) {
566
+ addNetworkVolumeToFlashEnvironment(input: $input) {
567
+ id
568
+ name
569
+ flashEnvironmentId
570
+ }
571
+ }
572
+ """
573
+
574
+ variables = {"input": input_data}
575
+
576
+ result = await self._execute_graphql(mutation, variables)
577
+
578
+ return result["addNetworkVolumeToFlashEnvironment"]
579
+
580
+ async def set_environment_state(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
581
+ log.debug(f"Setting Flash environment status with input data: {input_data}")
582
+
583
+ mutation = """
584
+ mutation updateFlashEnvironment($input: UpdateFlashEnvironmentInput!) {
585
+ updateFlashEnvironment(input: $input) {
586
+ id
587
+ name
588
+ state
589
+ }
590
+ }
591
+ """
592
+
593
+ variables = {"input": input_data}
594
+
595
+ result = await self._execute_graphql(mutation, variables)
596
+
597
+ return result["updateFlashEnvironment"]
598
+
599
+ async def get_flash_build(self, build_id: str) -> Dict[str, Any]:
600
+ """Fetch flash build by ID.
601
+
602
+ Args:
603
+ build_id: Build ID string (UUID format).
604
+
605
+ Returns:
606
+ Build data including id and manifest.
607
+
608
+ Raises:
609
+ TypeError: If build_id is not a string.
610
+ GraphQLQueryError: If build not found or query fails.
611
+
612
+ Note:
613
+ API changed in PR #144:
614
+ - Previously accepted Dict[str, Any], now requires string build_id directly
615
+ - Query now requests 'manifest' field instead of 'name' field
616
+ """
617
+ if not isinstance(build_id, str):
618
+ raise TypeError(
619
+ f"get_flash_build() expects build_id as str, got {type(build_id).__name__}. "
620
+ f"API changed in PR #144 - update caller to pass build_id string directly."
621
+ )
622
+
623
+ query = """
624
+ query getFlashBuild($input: String!) {
625
+ flashBuild(flashBuildId: $input) {
626
+ id
627
+ manifest
628
+ }
629
+ }
630
+ """
631
+ variables = {"input": build_id}
632
+
633
+ log.debug(f"Fetching flash build for input: {build_id}")
634
+ result = await self._execute_graphql(query, variables)
635
+
636
+ if "flashBuild" not in result:
637
+ raise GraphQLQueryError(
638
+ f"get_flash_build query failed for build {build_id}. "
639
+ f"Expected 'flashBuild' in response, got: {list(result.keys())}"
640
+ )
641
+
642
+ return result["flashBuild"]
643
+
644
+ async def list_flash_builds_by_app_id(self, app_id: str) -> List[Dict[str, Any]]:
645
+ """List all builds for a flash app by app ID (optimized query).
646
+
647
+ Args:
648
+ app_id: The flash app ID
649
+
650
+ Returns:
651
+ List of build dictionaries with id, objectKey, createdAt fields
652
+ """
653
+ query = """
654
+ query listFlashBuilds($flashAppId: String!) {
655
+ flashApp(flashAppId: $flashAppId) {
656
+ flashBuilds {
657
+ id
658
+ objectKey
659
+ createdAt
660
+ }
661
+ }
662
+ }
663
+ """
664
+ variables = {"flashAppId": app_id}
665
+
666
+ log.debug(f"Listing flash builds for app: {app_id}")
667
+ result = await self._execute_graphql(query, variables)
668
+ return result["flashApp"]["flashBuilds"]
669
+
670
+ async def list_flash_environments_by_app_id(
671
+ self, app_id: str
672
+ ) -> List[Dict[str, Any]]:
673
+ """List all environments for a flash app by app ID (optimized query).
674
+
675
+ Args:
676
+ app_id: The flash app ID
677
+
678
+ Returns:
679
+ List of environment dictionaries with id, name, state, activeBuildId, createdAt fields
680
+ """
681
+ query = """
682
+ query listFlashEnvironments($flashAppId: String!) {
683
+ flashApp(flashAppId: $flashAppId) {
684
+ flashEnvironments {
685
+ id
686
+ name
687
+ state
688
+ activeBuildId
689
+ createdAt
690
+ }
691
+ }
692
+ }
693
+ """
694
+ variables = {"flashAppId": app_id}
695
+
696
+ log.debug(f"Listing flash environments for app: {app_id}")
697
+ result = await self._execute_graphql(query, variables)
698
+ return result["flashApp"]["flashEnvironments"]
699
+
700
+ async def delete_flash_app(self, app_id: str) -> Dict[str, Any]:
701
+ mutation = """
702
+ mutation deleteFlashApp($flashAppId: String!) {
703
+ deleteFlashApp(flashAppId: $flashAppId)
704
+ }
705
+ """
706
+
707
+ variables = {"flashAppId": app_id}
708
+ log.info(f"Deleting flash app: {app_id}")
709
+
710
+ result = await self._execute_graphql(mutation, variables)
711
+ return {"success": "deleteFlashApp" in result}
712
+
713
+ async def delete_flash_environment(self, environment_id: str) -> Dict[str, Any]:
714
+ """Delete a flash environment."""
715
+ mutation = """
716
+ mutation deleteFlashEnvironment($flashEnvironmentId: String!) {
717
+ deleteFlashEnvironment(flashEnvironmentId: $flashEnvironmentId)
718
+ }
719
+ """
720
+
721
+ variables = {"flashEnvironmentId": environment_id}
722
+ log.info(f"Deleting flash environment: {environment_id}")
723
+
724
+ result = await self._execute_graphql(mutation, variables)
725
+ return {"success": "deleteFlashEnvironment" in result}
726
+
727
+ async def endpoint_exists(self, endpoint_id: str) -> bool:
728
+ """Check if an endpoint exists by querying the user's endpoint list."""
729
+ query = """
730
+ query {
731
+ myself {
732
+ endpoints {
733
+ id
734
+ }
735
+ }
736
+ }
737
+ """
738
+
739
+ try:
740
+ result = await self._execute_graphql(query)
741
+ endpoints = result.get("myself", {}).get("endpoints", [])
742
+ endpoint_ids = [ep.get("id") for ep in endpoints]
743
+ exists = endpoint_id in endpoint_ids
744
+
745
+ log.debug(f"Endpoint {endpoint_id} exists: {exists}")
746
+ return exists
747
+ except Exception as e:
748
+ log.error(f"Error checking endpoint existence: {e}")
749
+ return False
203
750
 
204
751
  async def close(self):
205
752
  """Close the HTTP session."""
@@ -222,7 +769,7 @@ class RunpodRestClient:
222
769
  def __init__(self, api_key: Optional[str] = None):
223
770
  self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
224
771
  if not self.api_key:
225
- raise ValueError("Runpod API key is required")
772
+ raise RunpodAPIKeyError()
226
773
 
227
774
  self.session: Optional[aiohttp.ClientSession] = None
228
775
 
@@ -267,36 +814,41 @@ class RunpodRestClient:
267
814
  raise Exception(f"HTTP request failed: {e}")
268
815
 
269
816
  async def create_network_volume(self, payload: Dict[str, Any]) -> Dict[str, Any]:
270
- """
271
- Create a network volume in Runpod.
817
+ """Create a network volume in Runpod."""
818
+ log.debug(f"Creating network volume: {payload.get('name', 'unnamed')}")
272
819
 
273
- Args:
274
- datacenter_id (str): The ID of the datacenter where the volume will be created.
275
- name (str): The name of the network volume.
276
- size_gb (int): The size of the volume in GB.
820
+ result = await self._execute_rest(
821
+ "POST", f"{RUNPOD_REST_API_URL}/networkvolumes", payload
822
+ )
277
823
 
278
- Returns:
279
- Dict[str, Any]: The created network volume details.
280
- """
281
- datacenter_id = payload.get("dataCenterId")
282
- if hasattr(datacenter_id, "value"):
283
- # If datacenter_id is an enum, get its value
284
- datacenter_id = datacenter_id.value
285
- data = {
286
- "dataCenterId": datacenter_id,
287
- "name": payload.get("name"),
288
- "size": payload.get("size"),
289
- }
290
- url = f"{RUNPOD_REST_API_URL}/networkvolumes"
824
+ log.info(
825
+ f"Created network volume: {result.get('id', 'unknown')} - {result.get('name', 'unnamed')}"
826
+ )
827
+
828
+ return result
291
829
 
292
- log.debug(f"Creating network volume: {data.get('name', 'unnamed')}")
830
+ async def list_network_volumes(self) -> Dict[str, Any]:
831
+ """
832
+ List all network volumes in Runpod.
293
833
 
294
- result = await self._execute_rest("POST", url, data)
834
+ Returns:
835
+ List of network volume objects or dict containing networkVolumes key.
836
+ The API may return either format depending on version.
837
+ """
838
+ log.debug("Listing network volumes")
295
839
 
296
- log.info(
297
- f"Created network volume: {result.get('id', 'unknown')} - {result.get('name', 'unnamed')}"
840
+ result = await self._execute_rest(
841
+ "GET", f"{RUNPOD_REST_API_URL}/networkvolumes"
298
842
  )
299
843
 
844
+ # Handle both list and dict responses
845
+ if isinstance(result, list):
846
+ volume_count = len(result)
847
+ else:
848
+ volume_count = len(result.get("networkVolumes", []))
849
+
850
+ log.debug(f"Listed {volume_count} network volumes")
851
+
300
852
  return result
301
853
 
302
854
  async def close(self):