tetra-rp 0.17.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tetra-rp might be problematic.

Files changed (66)
  1. tetra_rp/__init__.py +43 -0
  2. tetra_rp/cli/__init__.py +0 -0
  3. tetra_rp/cli/commands/__init__.py +1 -0
  4. tetra_rp/cli/commands/build.py +534 -0
  5. tetra_rp/cli/commands/deploy.py +370 -0
  6. tetra_rp/cli/commands/init.py +119 -0
  7. tetra_rp/cli/commands/resource.py +191 -0
  8. tetra_rp/cli/commands/run.py +100 -0
  9. tetra_rp/cli/main.py +85 -0
  10. tetra_rp/cli/utils/__init__.py +1 -0
  11. tetra_rp/cli/utils/conda.py +127 -0
  12. tetra_rp/cli/utils/deployment.py +172 -0
  13. tetra_rp/cli/utils/ignore.py +139 -0
  14. tetra_rp/cli/utils/skeleton.py +184 -0
  15. tetra_rp/cli/utils/skeleton_template/.env.example +3 -0
  16. tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
  17. tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
  18. tetra_rp/cli/utils/skeleton_template/README.md +256 -0
  19. tetra_rp/cli/utils/skeleton_template/main.py +43 -0
  20. tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
  21. tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
  22. tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +20 -0
  23. tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +38 -0
  24. tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +20 -0
  25. tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +62 -0
  26. tetra_rp/client.py +128 -0
  27. tetra_rp/config.py +29 -0
  28. tetra_rp/core/__init__.py +0 -0
  29. tetra_rp/core/api/__init__.py +6 -0
  30. tetra_rp/core/api/runpod.py +319 -0
  31. tetra_rp/core/exceptions.py +50 -0
  32. tetra_rp/core/resources/__init__.py +37 -0
  33. tetra_rp/core/resources/base.py +47 -0
  34. tetra_rp/core/resources/cloud.py +4 -0
  35. tetra_rp/core/resources/constants.py +4 -0
  36. tetra_rp/core/resources/cpu.py +146 -0
  37. tetra_rp/core/resources/environment.py +41 -0
  38. tetra_rp/core/resources/gpu.py +68 -0
  39. tetra_rp/core/resources/live_serverless.py +62 -0
  40. tetra_rp/core/resources/network_volume.py +148 -0
  41. tetra_rp/core/resources/resource_manager.py +145 -0
  42. tetra_rp/core/resources/serverless.py +463 -0
  43. tetra_rp/core/resources/serverless_cpu.py +162 -0
  44. tetra_rp/core/resources/template.py +94 -0
  45. tetra_rp/core/resources/utils.py +50 -0
  46. tetra_rp/core/utils/__init__.py +0 -0
  47. tetra_rp/core/utils/backoff.py +43 -0
  48. tetra_rp/core/utils/constants.py +10 -0
  49. tetra_rp/core/utils/file_lock.py +260 -0
  50. tetra_rp/core/utils/json.py +33 -0
  51. tetra_rp/core/utils/lru_cache.py +75 -0
  52. tetra_rp/core/utils/singleton.py +21 -0
  53. tetra_rp/core/validation.py +44 -0
  54. tetra_rp/execute_class.py +319 -0
  55. tetra_rp/logger.py +34 -0
  56. tetra_rp/protos/__init__.py +0 -0
  57. tetra_rp/protos/remote_execution.py +148 -0
  58. tetra_rp/stubs/__init__.py +5 -0
  59. tetra_rp/stubs/live_serverless.py +155 -0
  60. tetra_rp/stubs/registry.py +117 -0
  61. tetra_rp/stubs/serverless.py +30 -0
  62. tetra_rp-0.17.1.dist-info/METADATA +976 -0
  63. tetra_rp-0.17.1.dist-info/RECORD +66 -0
  64. tetra_rp-0.17.1.dist-info/WHEEL +5 -0
  65. tetra_rp-0.17.1.dist-info/entry_points.txt +2 -0
  66. tetra_rp-0.17.1.dist-info/top_level.txt +1 -0
tetra_rp/core/resources/cpu.py
@@ -0,0 +1,146 @@
+ from enum import Enum
+ from typing import List, Optional
+
+
+ class CpuInstanceType(str, Enum):
+     """Valid CPU instance types.
+
+     Format: {generation}{type}-{vcpu}-{memory_gb}
+     Based on Runpod backend validation logic:
+     - memoryInGb = vcpuCount * flavor.ramMultiplier
+
+     RAM Multipliers (DEV environment):
+     - cpu3g: 4.0 (1 vCPU = 4GB, 2 vCPU = 8GB, etc.)
+     - cpu3c: 2.0 (1 vCPU = 2GB, 2 vCPU = 4GB, etc.)
+     - cpu5c: 2.0 (1 vCPU = 2GB, 2 vCPU = 4GB, etc.)
+     - cpu5g: Not available
+     """
+
+     ANY = "any"
+     """Any CPU"""
+
+     # 3rd Generation General Purpose (RAM multiplier: 4.0)
+
+     CPU3G_1_4 = "cpu3g-1-4"
+     """1 vCPU, 4GB RAM, max 10GB container disk"""
+
+     CPU3G_2_8 = "cpu3g-2-8"
+     """2 vCPU, 8GB RAM, max 20GB container disk"""
+
+     CPU3G_4_16 = "cpu3g-4-16"
+     """4 vCPU, 16GB RAM, max 40GB container disk"""
+
+     CPU3G_8_32 = "cpu3g-8-32"
+     """8 vCPU, 32GB RAM, max 80GB container disk"""
+
+     # 3rd Generation Compute-Optimized (RAM multiplier: 2.0)
+
+     CPU3C_1_2 = "cpu3c-1-2"
+     """1 vCPU, 2GB RAM, max 10GB container disk"""
+
+     CPU3C_2_4 = "cpu3c-2-4"
+     """2 vCPU, 4GB RAM, max 20GB container disk"""
+
+     CPU3C_4_8 = "cpu3c-4-8"
+     """4 vCPU, 8GB RAM, max 40GB container disk"""
+
+     CPU3C_8_16 = "cpu3c-8-16"
+     """8 vCPU, 16GB RAM, max 80GB container disk"""
+
+     # 5th Generation Compute-Optimized (RAM multiplier: 2.0)
+
+     CPU5C_1_2 = "cpu5c-1-2"
+     """1 vCPU, 2GB RAM, max 15GB container disk"""
+
+     CPU5C_2_4 = "cpu5c-2-4"
+     """2 vCPU, 4GB RAM, max 30GB container disk"""
+
+     CPU5C_4_8 = "cpu5c-4-8"
+     """4 vCPU, 8GB RAM, max 60GB container disk"""
+
+     CPU5C_8_16 = "cpu5c-8-16"
+     """8 vCPU, 16GB RAM, max 120GB container disk"""
+
+     @classmethod
+     def all(cls) -> List["CpuInstanceType"]:
+         """Returns all CPU Instance Types."""
+         return [c for c in cls if c != cls.ANY]
+
+
+ def calculate_max_disk_size(instance_type: CpuInstanceType) -> int:
+     """
+     Calculate the maximum container disk size for a CPU instance type.
+
+     Formula:
+     - CPU3G/CPU3C: vCPU count × 10GB
+     - CPU5C: vCPU count × 15GB
+
+     Args:
+         instance_type: CPU instance type enum
+
+     Returns:
+         Maximum container disk size in GB
+
+     Example:
+         >>> calculate_max_disk_size(CpuInstanceType.CPU3G_1_4)
+         10
+         >>> calculate_max_disk_size(CpuInstanceType.CPU5C_2_4)
+         30
+     """
+     # Parse the instance type string to extract vCPU count
+     # Format: "cpu{generation}{type}-{vcpu}-{memory}"
+     instance_str = instance_type.value
+     parts = instance_str.split("-")
+
+     if len(parts) != 3:
+         raise ValueError(f"Invalid instance type format: {instance_str}")
+
+     vcpu_count = int(parts[1])
+
+     # Determine disk multiplier based on generation
+     if instance_str.startswith("cpu5c"):
+         disk_multiplier = 15  # CPU5C: 15GB per vCPU
+     elif instance_str.startswith(("cpu3g", "cpu3c")):
+         disk_multiplier = 10  # CPU3G/CPU3C: 10GB per vCPU
+     else:
+         raise ValueError(f"Unknown CPU generation/type: {instance_str}")
+
+     return vcpu_count * disk_multiplier
+
+
+ # CPU Instance Type Disk Limits (calculated programmatically)
+ CPU_INSTANCE_DISK_LIMITS = {
+     instance_type: calculate_max_disk_size(instance_type)
+     for instance_type in CpuInstanceType
+     if instance_type != CpuInstanceType.ANY
+ }
+
+
+ def get_max_disk_size_for_instances(
+     instance_types: Optional[List[CpuInstanceType]],
+ ) -> Optional[int]:
+     """
+     Calculate the maximum container disk size for a list of CPU instance types.
+
+     Returns the minimum disk limit across all instance types to ensure compatibility
+     with all specified instances.
+
+     Args:
+         instance_types: List of CPU instance types, or None
+
+     Returns:
+         Maximum allowed disk size in GB, or None if no CPU instances specified
+
+     Example:
+         >>> get_max_disk_size_for_instances([CpuInstanceType.CPU3G_1_4])
+         10
+         >>> get_max_disk_size_for_instances([CpuInstanceType.CPU3G_1_4, CpuInstanceType.CPU3G_2_8])
+         10
+     """
+     if not instance_types:
+         return None
+
+     disk_limits = [
+         CPU_INSTANCE_DISK_LIMITS[instance_type] for instance_type in instance_types
+     ]
+     return min(disk_limits)
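
The two helpers in cpu.py are pure functions over the enum, so the sizing rules stated in the docstrings (10GB of container disk per vCPU for cpu3g/cpu3c, 15GB per vCPU for cpu5c, and the minimum across a list) can be exercised directly. A minimal sketch, assuming the module is imported by the file path shown above:

    from tetra_rp.core.resources.cpu import (
        CpuInstanceType,
        calculate_max_disk_size,
        get_max_disk_size_for_instances,
    )

    # Per-instance limits: vCPU count x 10GB (cpu3g/cpu3c) or x 15GB (cpu5c)
    assert calculate_max_disk_size(CpuInstanceType.CPU3C_8_16) == 80
    assert calculate_max_disk_size(CpuInstanceType.CPU5C_4_8) == 60

    # For a mixed list the smallest limit wins, so the disk fits every instance
    assert get_max_disk_size_for_instances(
        [CpuInstanceType.CPU3G_2_8, CpuInstanceType.CPU5C_1_2]
    ) == 15  # cpu5c-1-2: 1 vCPU x 15GB
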
tetra_rp/core/resources/environment.py
@@ -0,0 +1,41 @@
+ from typing import Dict, Optional
+ from dotenv import dotenv_values
+
+
+ class EnvironmentVars:
+     def __init__(self):
+         # Store environment variables from .env file
+         self.env = self._load_env()
+
+     def _load_env(self) -> Dict[str, str]:
+         """
+         Loads environment variables specifically from the .env file
+         and returns them as a dictionary.
+
+         Returns:
+             Dict[str, str]: Dictionary containing environment variables from .env file
+         """
+         # Use dotenv_values instead of load_dotenv to get only variables from .env
+         return dict(dotenv_values())
+
+     def get_env(self) -> Dict[str, str]:
+         """
+         Returns the dictionary of environment variables.
+
+         Returns:
+             Dict[str, str]: Dictionary containing environment variables
+         """
+         return self.env
+
+     def get_value(self, key: str, default: str = None) -> Optional[str]:
+         """
+         Gets a specific environment variable by key.
+
+         Args:
+             key (str): The environment variable key
+             default (str, optional): Default value if key doesn't exist
+
+         Returns:
+             Optional[str]: Value of the environment variable or default
+         """
+         return self.env.get(key, default)
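
Because _load_env uses dotenv_values rather than load_dotenv, only keys read from the .env file are returned and os.environ is never modified. A short illustrative sketch; the key and value below are hypothetical:

    from tetra_rp.core.resources.environment import EnvironmentVars

    # Suppose .env contains: RUNPOD_API_KEY=abc123  (hypothetical contents)
    env = EnvironmentVars()
    env.get_env()                    # {"RUNPOD_API_KEY": "abc123"}
    env.get_value("RUNPOD_API_KEY")  # "abc123"
    env.get_value("MISSING", "n/a")  # "n/a", falls back to the default
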
tetra_rp/core/resources/gpu.py
@@ -0,0 +1,68 @@
+ from typing import Optional, List
+ from pydantic import BaseModel
+ from enum import Enum
+
+
+ class GpuLowestPrice(BaseModel):
+     minimumBidPrice: Optional[float] = None
+     uninterruptablePrice: Optional[float] = None
+
+
+ class GpuType(BaseModel):
+     id: str
+     displayName: str
+     memoryInGb: int
+
+
+ class GpuTypeDetail(GpuType):
+     communityCloud: Optional[bool] = None
+     communityPrice: Optional[float] = None
+     communitySpotPrice: Optional[float] = None
+     cudaCores: Optional[int] = None
+     lowestPrice: Optional[GpuLowestPrice] = None
+     manufacturer: Optional[str] = None
+     maxGpuCount: Optional[int] = None
+     oneMonthPrice: Optional[float] = None
+     oneWeekPrice: Optional[float] = None
+     secureCloud: Optional[bool] = None
+     securePrice: Optional[float] = None
+     secureSpotPrice: Optional[float] = None
+     threeMonthPrice: Optional[float] = None
+
+
+ # TODO: this should be fetched from an API
+ class GpuGroup(Enum):
+     ANY = "any"
+     """Any GPU"""
+
+     ADA_24 = "ADA_24"
+     """NVIDIA GeForce RTX 4090"""
+
+     ADA_32_PRO = "ADA_32_PRO"
+     """NVIDIA GeForce RTX 5090"""
+
+     ADA_48_PRO = "ADA_48_PRO"
+     """NVIDIA RTX 6000 Ada Generation, NVIDIA L40, NVIDIA L40S"""
+
+     ADA_80_PRO = "ADA_80_PRO"
+     """NVIDIA H100 PCIe, NVIDIA H100 80GB HBM3, NVIDIA H100 NVL"""
+
+     AMPERE_16 = "AMPERE_16"
+     """NVIDIA RTX A4000, NVIDIA RTX A4500, NVIDIA RTX 4000 Ada Generation, NVIDIA RTX 2000 Ada Generation"""
+
+     AMPERE_24 = "AMPERE_24"
+     """NVIDIA RTX A5000, NVIDIA L4, NVIDIA GeForce RTX 3090"""
+
+     AMPERE_48 = "AMPERE_48"
+     """NVIDIA A40, NVIDIA RTX A6000"""
+
+     AMPERE_80 = "AMPERE_80"
+     """NVIDIA A100 80GB PCIe, NVIDIA A100-SXM4-80GB"""
+
+     HOPPER_141 = "HOPPER_141"
+     """NVIDIA H200"""
+
+     @classmethod
+     def all(cls) -> List["GpuGroup"]:
+         """Returns all GPU groups."""
+         return [cls.AMPERE_48] + [g for g in cls if g != cls.ANY]
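
One detail of GpuGroup.all() worth noting: it prepends AMPERE_48, but the comprehension only filters out ANY, so AMPERE_48 also appears a second time in enum order. A quick check of the behavior as written:

    from tetra_rp.core.resources.gpu import GpuGroup

    groups = GpuGroup.all()
    assert groups[0] is GpuGroup.AMPERE_48        # placed first explicitly
    assert groups.count(GpuGroup.AMPERE_48) == 2  # and included again by the comprehension
    assert GpuGroup.ANY not in groups
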
tetra_rp/core/resources/live_serverless.py
@@ -0,0 +1,62 @@
+ # Ship serverless code as you write it. No builds, no deploys — just run.
+ import os
+ from pydantic import model_validator
+ from .serverless import ServerlessEndpoint
+ from .serverless_cpu import CpuServerlessEndpoint
+
+ TETRA_IMAGE_TAG = os.environ.get("TETRA_IMAGE_TAG", "latest")
+ TETRA_GPU_IMAGE = os.environ.get(
+     "TETRA_GPU_IMAGE", f"runpod/tetra-rp:{TETRA_IMAGE_TAG}"
+ )
+ TETRA_CPU_IMAGE = os.environ.get(
+     "TETRA_CPU_IMAGE", f"runpod/tetra-rp-cpu:{TETRA_IMAGE_TAG}"
+ )
+
+
+ class LiveServerlessMixin:
+     """Common mixin for live serverless endpoints that locks the image."""
+
+     @property
+     def _live_image(self) -> str:
+         """Override in subclasses to specify the locked image."""
+         raise NotImplementedError("Subclasses must define _live_image")
+
+     @property
+     def imageName(self):
+         # Lock imageName to specific image
+         return self._live_image
+
+     @imageName.setter
+     def imageName(self, value):
+         # Prevent manual setting of imageName
+         pass
+
+
+ class LiveServerless(LiveServerlessMixin, ServerlessEndpoint):
+     """GPU-only live serverless endpoint."""
+
+     @property
+     def _live_image(self) -> str:
+         return TETRA_GPU_IMAGE
+
+     @model_validator(mode="before")
+     @classmethod
+     def set_live_serverless_template(cls, data: dict):
+         """Set default GPU image for Live Serverless."""
+         data["imageName"] = TETRA_GPU_IMAGE
+         return data
+
+
+ class CpuLiveServerless(LiveServerlessMixin, CpuServerlessEndpoint):
+     """CPU-only live serverless endpoint with automatic disk sizing."""
+
+     @property
+     def _live_image(self) -> str:
+         return TETRA_CPU_IMAGE
+
+     @model_validator(mode="before")
+     @classmethod
+     def set_live_serverless_template(cls, data: dict):
+         """Set default CPU image for Live Serverless."""
+         data["imageName"] = TETRA_CPU_IMAGE
+         return data
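
The mixin and the before-mode validators work together: the validator writes the default image into the incoming data, and the read-only imageName property keeps it pinned afterward, so user-supplied images are ignored. A sketch of that intended behavior, assuming ServerlessEndpoint accepts a name field (its definition is not part of this hunk):

    from tetra_rp.core.resources.live_serverless import LiveServerless, TETRA_GPU_IMAGE

    endpoint = LiveServerless(name="example-gpu")   # hypothetical endpoint name
    endpoint.imageName = "myorg/custom:image"       # setter is a no-op by design
    assert endpoint.imageName == TETRA_GPU_IMAGE    # image stays locked to the tetra-rp image
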
tetra_rp/core/resources/network_volume.py
@@ -0,0 +1,148 @@
+ import hashlib
+ import logging
+ from enum import Enum
+ from typing import Optional
+
+ from pydantic import (
+     Field,
+     field_serializer,
+ )
+
+ from ..api.runpod import RunpodRestClient
+ from .base import DeployableResource
+ from .constants import CONSOLE_BASE_URL
+
+ log = logging.getLogger(__name__)
+
+
+ class DataCenter(str, Enum):
+     """
+     Enum representing available data centers for network volumes.
+     #TODO: Add more data centers as needed. Lock this to the available data center.
+     """
+
+     EU_RO_1 = "EU-RO-1"
+
+
+ class NetworkVolume(DeployableResource):
+     """
+     NetworkVolume resource for creating and managing Runpod network volumes.
+
+     This class handles the creation, deployment, and management of network volumes
+     that can be attached to serverless resources. Supports idempotent deployment
+     where multiple volumes with the same name will reuse existing volumes.
+
+     """
+
+     # Internal fixed value
+     dataCenterId: DataCenter = Field(default=DataCenter.EU_RO_1, frozen=True)
+
+     id: Optional[str] = Field(default=None)
+     name: str
+     size: Optional[int] = Field(default=100, gt=0)  # Size in GB
+
+     def __str__(self) -> str:
+         return f"{self.__class__.__name__}:{self.id}"
+
+     @property
+     def resource_id(self) -> str:
+         """Unique resource ID based on name and datacenter for idempotent behavior."""
+         # Use name + datacenter to ensure idempotence
+         resource_type = self.__class__.__name__
+         config_key = f"{self.name}:{self.dataCenterId.value}"
+         hash_obj = hashlib.md5(f"{resource_type}:{config_key}".encode())
+         return f"{resource_type}_{hash_obj.hexdigest()}"
+
+     @field_serializer("dataCenterId")
+     def serialize_data_center_id(self, value: Optional[DataCenter]) -> Optional[str]:
+         """Convert DataCenter enum to string."""
+         return value.value if value is not None else None
+
+     @property
+     def is_created(self) -> bool:
+         "Returns True if the network volume already exists."
+         return self.id is not None
+
+     @property
+     def url(self) -> str:
+         """
+         Returns the URL for the network volume resource.
+         """
+         if not self.id:
+             raise ValueError("Network volume ID is not set")
+         return f"{CONSOLE_BASE_URL}/user/storage"
+
+     def is_deployed(self) -> bool:
+         """
+         Checks if the network volume resource is deployed and available.
+         """
+         return self.id is not None
+
+     def _normalize_volumes_response(self, volumes_response) -> list:
+         """Normalize API response to list format."""
+         if isinstance(volumes_response, list):
+             return volumes_response
+         return volumes_response.get("networkVolumes", [])
+
+     def _find_matching_volume(self, existing_volumes: list) -> Optional[dict]:
+         """Find existing volume matching name and datacenter."""
+         for volume_data in existing_volumes:
+             if (
+                 volume_data.get("name") == self.name
+                 and volume_data.get("dataCenterId") == self.dataCenterId.value
+             ):
+                 return volume_data
+         return None
+
+     async def _find_existing_volume(self, client) -> Optional["NetworkVolume"]:
+         """Check for existing volume with same name and datacenter."""
+         if not self.name:
+             return None
+
+         log.debug(f"Checking for existing network volume with name: {self.name}")
+         volumes_response = await client.list_network_volumes()
+         existing_volumes = self._normalize_volumes_response(volumes_response)
+
+         if matching_volume := self._find_matching_volume(existing_volumes):
+             log.info(
+                 f"Found existing network volume: {matching_volume.get('id')} with name '{self.name}'"
+             )
+             # Update our instance with the existing volume's ID
+             self.id = matching_volume.get("id")
+             return self
+
+         return None
+
+     async def _create_new_volume(self, client) -> "NetworkVolume":
+         """Create a new network volume."""
+         log.debug(f"Creating new network volume: {self.name or 'unnamed'}")
+         payload = self.model_dump(exclude_none=True)
+         result = await client.create_network_volume(payload)
+
+         if volume := self.__class__(**result):
+             return volume
+
+         raise ValueError("Deployment failed, no volume was created.")
+
+     async def deploy(self) -> "DeployableResource":
+         """
+         Deploys the network volume resource using the provided configuration.
+         Returns a DeployableResource object.
+         """
+         try:
+             # If the resource is already deployed, return it
+             if self.is_deployed():
+                 log.debug(f"{self} exists")
+                 return self
+
+             async with RunpodRestClient() as client:
+                 # Check for existing volume first
+                 if existing_volume := await self._find_existing_volume(client):
+                     return existing_volume
+
+                 # No existing volume found, create a new one
+                 return await self._create_new_volume(client)
+
+         except Exception as e:
+             log.error(f"{self} failed to deploy: {e}")
+             raise
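
Deployment is idempotent by name and data center: resource_id hashes only those two values, and deploy() reuses any existing volume whose name and dataCenterId match before creating a new one. A minimal sketch (requires a configured Runpod API key; the volume name is illustrative):

    import asyncio

    from tetra_rp.core.resources.network_volume import NetworkVolume

    async def main():
        # First deploy either creates the volume or reuses one with the same name
        volume = await NetworkVolume(name="example-cache", size=100).deploy()
        print(volume.id, volume.url)

        # A second deploy with the same name and data center reuses that volume
        again = await NetworkVolume(name="example-cache").deploy()
        assert again.id == volume.id

    asyncio.run(main())
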
tetra_rp/core/resources/resource_manager.py
@@ -0,0 +1,145 @@
+ import asyncio
+ import cloudpickle
+ import logging
+ from typing import Dict, Optional
+ from pathlib import Path
+
+ from ..exceptions import RunpodAPIKeyError
+ from ..utils.singleton import SingletonMixin
+ from ..utils.file_lock import file_lock, FileLockError
+
+ from .base import DeployableResource
+
+
+ log = logging.getLogger(__name__)
+
+ # File to persist state of resources
+ RESOURCE_STATE_FILE = Path(".tetra_resources.pkl")
+
+
+ class ResourceManager(SingletonMixin):
+     """Manages dynamic provisioning and tracking of remote resources."""
+
+     # Class variables shared across all instances (singleton)
+     _resources: Dict[str, DeployableResource] = {}
+     _deployment_locks: Dict[str, asyncio.Lock] = {}
+     _global_lock: Optional[asyncio.Lock] = None
+     _lock_initialized = False
+     _resources_initialized = False
+
+     def __init__(self):
+         # Ensure async locks are initialized properly for the singleton instance
+         if not ResourceManager._lock_initialized:
+             ResourceManager._global_lock = asyncio.Lock()
+             ResourceManager._lock_initialized = True
+
+         # Load resources immediately on initialization (only once)
+         if not ResourceManager._resources_initialized:
+             self._load_resources()
+             ResourceManager._resources_initialized = True
+
+     def _load_resources(self) -> Dict[str, DeployableResource]:
+         """Load persisted resource information using cross-platform file locking."""
+         if RESOURCE_STATE_FILE.exists():
+             try:
+                 with open(RESOURCE_STATE_FILE, "rb") as f:
+                     # Acquire shared lock for reading (cross-platform)
+                     with file_lock(f, exclusive=False):
+                         self._resources = cloudpickle.load(f)
+                         log.debug(f"Loaded saved resources from {RESOURCE_STATE_FILE}")
+             except (FileLockError, Exception) as e:
+                 log.error(f"Failed to load resources from {RESOURCE_STATE_FILE}: {e}")
+         return self._resources
+
+     def _save_resources(self) -> None:
+         """Persist state of resources to disk using cross-platform file locking."""
+         try:
+             with open(RESOURCE_STATE_FILE, "wb") as f:
+                 # Acquire exclusive lock for writing (cross-platform)
+                 with file_lock(f, exclusive=True):
+                     cloudpickle.dump(self._resources, f)
+                     f.flush()  # Ensure data is written to disk
+                     log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
+         except (FileLockError, Exception) as e:
+             log.error(f"Failed to save resources to {RESOURCE_STATE_FILE}: {e}")
+             raise
+
+     def add_resource(self, uid: str, resource: DeployableResource):
+         """Add a resource to the manager."""
+         self._resources[uid] = resource
+         self._save_resources()
+
+     # function to check if resource still exists remotely, else remove it
+     def remove_resource(self, uid: str):
+         """Remove a resource from the manager."""
+         if uid not in self._resources:
+             log.warning(f"Resource {uid} not found for removal")
+             return
+
+         del self._resources[uid]
+         log.debug(f"Removed resource {uid}")
+
+         self._save_resources()
+
+     async def _deploy_with_error_context(
+         self, config: DeployableResource
+     ) -> DeployableResource:
+         """Deploy resource with enhanced error context for RunpodAPIKeyError.
+
+         Args:
+             config: Resource configuration to deploy.
+
+         Returns:
+             Deployed resource instance.
+
+         Raises:
+             RunpodAPIKeyError: If deployment fails due to missing API key, with resource context.
+         """
+         try:
+             return await config.deploy()
+         except RunpodAPIKeyError as e:
+             error_msg = f"Cannot deploy resource '{config.name}': {str(e)}"
+             raise RunpodAPIKeyError(error_msg) from e
+
+     async def get_or_deploy_resource(
+         self, config: DeployableResource
+     ) -> DeployableResource:
+         """Get existing or create new resource based on config.
+
+         Thread-safe implementation that prevents concurrent deployments
+         of the same resource configuration.
+         """
+         uid = config.resource_id
+
+         # Ensure global lock is initialized (should be done in __init__)
+         assert ResourceManager._global_lock is not None, "Global lock not initialized"
+
+         # Get or create a per-resource lock
+         async with ResourceManager._global_lock:
+             if uid not in ResourceManager._deployment_locks:
+                 ResourceManager._deployment_locks[uid] = asyncio.Lock()
+             resource_lock = ResourceManager._deployment_locks[uid]
+
+         # Acquire per-resource lock for this specific configuration
+         async with resource_lock:
+             # Double-check pattern: check again inside the lock
+             if existing := self._resources.get(uid):
+                 if not existing.is_deployed():
+                     log.warning(f"{existing} is no longer valid, redeploying.")
+                     self.remove_resource(uid)
+                     # Don't recursive call - deploy directly within the lock
+                     deployed_resource = await self._deploy_with_error_context(config)
+                     log.info(f"URL: {deployed_resource.url}")
+                     self.add_resource(uid, deployed_resource)
+                     return deployed_resource
+
+                 log.debug(f"{existing} exists, reusing.")
+                 log.info(f"URL: {existing.url}")
+                 return existing
+
+             # No existing resource, deploy new one
+             log.debug(f"Deploying new resource: {uid}")
+             deployed_resource = await self._deploy_with_error_context(config)
+             log.info(f"URL: {deployed_resource.url}")
+             self.add_resource(uid, deployed_resource)
+             return deployed_resource
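
get_or_deploy_resource is the intended entry point: the global lock only guards the per-resource lock table, while the per-resource lock plus the re-check inside it keep concurrent callers with the same configuration from deploying twice, and results are persisted to .tetra_resources.pkl for reuse across runs. A minimal sketch reusing the NetworkVolume resource from the previous hunk (requires a configured Runpod API key; names are illustrative):

    import asyncio

    from tetra_rp.core.resources.network_volume import NetworkVolume
    from tetra_rp.core.resources.resource_manager import ResourceManager

    async def main():
        manager = ResourceManager()  # singleton; state is persisted in .tetra_resources.pkl
        config = NetworkVolume(name="example-cache")

        # Concurrent callers with the same resource_id serialize on one per-resource
        # lock, so the first call deploys and the second reuses the cached resource.
        first, second = await asyncio.gather(
            manager.get_or_deploy_resource(config),
            manager.get_or_deploy_resource(config),
        )
        assert first.resource_id == second.resource_id

    asyncio.run(main())
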