tetra-rp 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (38)
  1. tetra_rp/__init__.py +2 -0
  2. tetra_rp/cli/__init__.py +0 -0
  3. tetra_rp/cli/commands/__init__.py +1 -0
  4. tetra_rp/cli/commands/deploy.py +336 -0
  5. tetra_rp/cli/commands/init.py +86 -0
  6. tetra_rp/cli/commands/resource.py +191 -0
  7. tetra_rp/cli/commands/run.py +122 -0
  8. tetra_rp/cli/main.py +81 -0
  9. tetra_rp/cli/templates/advanced/main.py +58 -0
  10. tetra_rp/cli/templates/advanced/utils.py +24 -0
  11. tetra_rp/cli/templates/basic/main.py +32 -0
  12. tetra_rp/cli/templates/gpu-compute/main.py +64 -0
  13. tetra_rp/cli/templates/web-api/api.py +67 -0
  14. tetra_rp/cli/templates/web-api/main.py +42 -0
  15. tetra_rp/cli/utils/__init__.py +1 -0
  16. tetra_rp/cli/utils/deployment.py +172 -0
  17. tetra_rp/cli/utils/skeleton.py +101 -0
  18. tetra_rp/client.py +0 -6
  19. tetra_rp/config.py +29 -0
  20. tetra_rp/core/resources/__init__.py +3 -2
  21. tetra_rp/core/resources/cpu.py +115 -12
  22. tetra_rp/core/resources/gpu.py +29 -14
  23. tetra_rp/core/resources/live_serverless.py +40 -14
  24. tetra_rp/core/resources/resource_manager.py +63 -22
  25. tetra_rp/core/resources/serverless.py +27 -46
  26. tetra_rp/core/resources/serverless_cpu.py +154 -0
  27. tetra_rp/core/utils/file_lock.py +260 -0
  28. tetra_rp/core/utils/singleton.py +15 -1
  29. tetra_rp/execute_class.py +0 -3
  30. tetra_rp/protos/remote_execution.py +0 -4
  31. tetra_rp/stubs/live_serverless.py +11 -9
  32. tetra_rp/stubs/registry.py +25 -14
  33. {tetra_rp-0.11.0.dist-info → tetra_rp-0.13.0.dist-info}/METADATA +5 -1
  34. tetra_rp-0.13.0.dist-info/RECORD +56 -0
  35. tetra_rp-0.13.0.dist-info/entry_points.txt +2 -0
  36. tetra_rp-0.11.0.dist-info/RECORD +0 -36
  37. {tetra_rp-0.11.0.dist-info → tetra_rp-0.13.0.dist-info}/WHEEL +0 -0
  38. {tetra_rp-0.11.0.dist-info → tetra_rp-0.13.0.dist-info}/top_level.txt +0 -0
tetra_rp/cli/utils/skeleton.py ADDED
@@ -0,0 +1,101 @@
+ """Project skeleton creation utilities."""
+
+ from pathlib import Path
+ from typing import Dict, List, Any
+
+ from tetra_rp.config import get_paths
+
+
+ def get_template_directory() -> Path:
+     """Get the path to the templates directory."""
+     return Path(__file__).parent.parent / "templates"
+
+
+ def load_template_files(template_name: str) -> Dict[str, Any]:
+     """Load template files from filesystem."""
+     template_dir = get_template_directory() / template_name
+
+     if not template_dir.exists():
+         raise ValueError(f"Template '{template_name}' not found in {template_dir}")
+
+     files = {}
+
+     # Load all files from the template directory
+     for file_path in template_dir.iterdir():
+         if file_path.is_file():
+             relative_path = file_path.name
+
+             # Special handling for config.json - return as callable that generates tetra config
+             if file_path.name == "config.json":
+                 config_content = file_path.read_text()
+                 files[".tetra/config.json"] = lambda content=config_content: content
+             else:
+                 files[relative_path] = file_path.read_text()
+
+     return files
+
+
+ def get_available_templates() -> Dict[str, Dict[str, Any]]:
+     """Get available project templates from filesystem."""
+     template_dir = get_template_directory()
+     templates = {}
+
+     # Template descriptions
+     descriptions = {
+         "basic": "Simple remote function example",
+         "advanced": "Multi-function project with dependencies",
+         "gpu-compute": "GPU-optimized compute workload",
+         "web-api": "FastAPI web service deployment",
+     }
+
+     # Discover templates from filesystem
+     for template_path in template_dir.iterdir():
+         if template_path.is_dir():
+             template_name = template_path.name
+             try:
+                 templates[template_name] = {
+                     "description": descriptions.get(
+                         template_name, f"{template_name} template"
+                     ),
+                     "files": load_template_files(template_name),
+                 }
+             except Exception as e:
+                 print(f"Warning: Failed to load template '{template_name}': {e}")
+
+     return templates
+
+
+ def create_project_skeleton(
+     template_name: str, template_info: Dict[str, Any], force: bool = False
+ ) -> List[str]:
+     """Create project skeleton from template."""
+     created_files = []
+
+     # Create .tetra directory using centralized config
+     paths = get_paths()
+     paths.ensure_tetra_dir()
+
+     # Create files from template
+     for file_path, content in template_info["files"].items():
+         path = Path(file_path)
+
+         # Create parent directories if needed
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Skip existing files unless force is True
+         if path.exists() and not force:
+             continue
+
+         # Get content (could be string or callable)
+         if callable(content):
+             file_content = content()
+         else:
+             file_content = content
+
+         # Write file
+         with open(path, "w") as f:
+             f.write(file_content)
+
+         created_files.append(str(path))
+
+     return created_files
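Taken together, these helpers scaffold a project in two calls. A minimal usage sketch against the new module ("basic" is one of the bundled template names from the descriptions dict above; this assumes it runs from the directory the project should be created in):

    from tetra_rp.cli.utils.skeleton import (
        get_available_templates,
        create_project_skeleton,
    )

    # Discover the bundled templates, then scaffold the "basic" one.
    templates = get_available_templates()
    created = create_project_skeleton("basic", templates["basic"])
    for path in created:
        print(f"created {path}")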
tetra_rp/client.py CHANGED
@@ -15,7 +15,6 @@ def remote(
      dependencies: Optional[List[str]] = None,
      system_dependencies: Optional[List[str]] = None,
      accelerate_downloads: bool = True,
-     hf_models_to_cache: Optional[List[str]] = None,
      **extra,
  ):
      """
@@ -33,8 +32,6 @@ def remote(
              environment before executing the function. Defaults to None.
          accelerate_downloads (bool, optional): Enable download acceleration for dependencies and models.
              Defaults to True.
-         hf_models_to_cache (List[str], optional): List of HuggingFace model IDs to pre-cache using
-             download acceleration. Defaults to None.
          extra (dict, optional): Additional parameters for the execution of the resource. Defaults to an empty dict.

      Returns:
@@ -47,7 +44,6 @@ def remote(
          resource_config=my_resource_config,
          dependencies=["numpy", "pandas"],
          accelerate_downloads=True,
-         hf_models_to_cache=["gpt2", "bert-base-uncased"]
      )
      async def my_function(data):
          # Function logic here
@@ -64,7 +60,6 @@ def remote(
          dependencies,
          system_dependencies,
          accelerate_downloads,
-         hf_models_to_cache,
          extra,
      )
  else:
@@ -82,7 +77,6 @@ def remote(
          dependencies,
          system_dependencies,
          accelerate_downloads,
-         hf_models_to_cache,
          *args,
          **kwargs,
      )
tetra_rp/config.py ADDED
@@ -0,0 +1,29 @@
+ """Configuration management for tetra-rp CLI."""
+
+ from pathlib import Path
+ from typing import NamedTuple
+
+
+ class TetraPaths(NamedTuple):
+     """Paths for tetra-rp configuration and data."""
+
+     tetra_dir: Path
+     config_file: Path
+     deployments_file: Path
+
+     def ensure_tetra_dir(self) -> None:
+         """Ensure the .tetra directory exists."""
+         self.tetra_dir.mkdir(exist_ok=True)
+
+
+ def get_paths() -> TetraPaths:
+     """Get standardized paths for tetra-rp configuration."""
+     tetra_dir = Path.cwd() / ".tetra"
+     config_file = tetra_dir / "config.json"
+     deployments_file = tetra_dir / "deployments.json"
+
+     return TetraPaths(
+         tetra_dir=tetra_dir,
+         config_file=config_file,
+         deployments_file=deployments_file,
+     )
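A quick illustration of the contract: every path is anchored at the current working directory (via Path.cwd()), which makes the CLI configuration per-project rather than per-user.

    from tetra_rp.config import get_paths

    paths = get_paths()
    paths.ensure_tetra_dir()        # creates ./.tetra if it does not exist
    print(paths.config_file)        # <cwd>/.tetra/config.json
    print(paths.deployments_file)   # <cwd>/.tetra/deployments.json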
tetra_rp/core/resources/__init__.py CHANGED
@@ -3,14 +3,14 @@ from .cloud import runpod
  from .cpu import CpuInstanceType
  from .gpu import GpuGroup, GpuType, GpuTypeDetail
  from .resource_manager import ResourceManager
- from .live_serverless import LiveServerless
+ from .live_serverless import LiveServerless, CpuLiveServerless
  from .serverless import (
-     CpuServerlessEndpoint,
      ServerlessResource,
      ServerlessEndpoint,
      JobOutput,
      CudaVersion,
  )
+ from .serverless_cpu import CpuServerlessEndpoint
  from .template import PodTemplate
  from .network_volume import NetworkVolume, DataCenter

@@ -19,6 +19,7 @@ __all__ = [
      "runpod",
      "BaseResource",
      "CpuInstanceType",
+     "CpuLiveServerless",
      "CpuServerlessEndpoint",
      "CudaVersion",
      "DataCenter",
tetra_rp/core/resources/cpu.py CHANGED
@@ -1,4 +1,5 @@
  from enum import Enum
+ from typing import List, Optional


  class CpuInstanceType(str, Enum):
@@ -16,19 +17,121 @@ class CpuInstanceType(str, Enum):
      """

      # 3rd Generation General Purpose (RAM multiplier: 4.0)
-     CPU3G_1_4 = "cpu3g-1-4"  # 1 vCPU, 4GB RAM
-     CPU3G_2_8 = "cpu3g-2-8"  # 2 vCPU, 8GB RAM
-     CPU3G_4_16 = "cpu3g-4-16"  # 4 vCPU, 16GB RAM
-     CPU3G_8_32 = "cpu3g-8-32"  # 8 vCPU, 32GB RAM
+
+     CPU3G_1_4 = "cpu3g-1-4"
+     """1 vCPU, 4GB RAM, max 10GB container disk"""
+
+     CPU3G_2_8 = "cpu3g-2-8"
+     """2 vCPU, 8GB RAM, max 20GB container disk"""
+
+     CPU3G_4_16 = "cpu3g-4-16"
+     """4 vCPU, 16GB RAM, max 40GB container disk"""
+
+     CPU3G_8_32 = "cpu3g-8-32"
+     """8 vCPU, 32GB RAM, max 80GB container disk"""

      # 3rd Generation Compute-Optimized (RAM multiplier: 2.0)
-     CPU3C_1_2 = "cpu3c-1-2"  # 1 vCPU, 2GB RAM
-     CPU3C_2_4 = "cpu3c-2-4"  # 2 vCPU, 4GB RAM
-     CPU3C_4_8 = "cpu3c-4-8"  # 4 vCPU, 8GB RAM
-     CPU3C_8_16 = "cpu3c-8-16"  # 8 vCPU, 16GB RAM
+
+     CPU3C_1_2 = "cpu3c-1-2"
+     """1 vCPU, 2GB RAM, max 10GB container disk"""
+
+     CPU3C_2_4 = "cpu3c-2-4"
+     """2 vCPU, 4GB RAM, max 20GB container disk"""
+
+     CPU3C_4_8 = "cpu3c-4-8"
+     """4 vCPU, 8GB RAM, max 40GB container disk"""
+
+     CPU3C_8_16 = "cpu3c-8-16"
+     """8 vCPU, 16GB RAM, max 80GB container disk"""

      # 5th Generation Compute-Optimized (RAM multiplier: 2.0)
-     CPU5C_1_2 = "cpu5c-1-2"  # 1 vCPU, 2GB RAM
-     CPU5C_2_4 = "cpu5c-2-4"  # 2 vCPU, 4GB RAM
-     CPU5C_4_8 = "cpu5c-4-8"  # 4 vCPU, 8GB RAM
-     CPU5C_8_16 = "cpu5c-8-16"  # 8 vCPU, 16GB RAM
+
+     CPU5C_1_2 = "cpu5c-1-2"
+     """1 vCPU, 2GB RAM, max 15GB container disk"""
+
+     CPU5C_2_4 = "cpu5c-2-4"
+     """2 vCPU, 4GB RAM, max 30GB container disk"""
+
+     CPU5C_4_8 = "cpu5c-4-8"
+     """4 vCPU, 8GB RAM, max 60GB container disk"""
+
+     CPU5C_8_16 = "cpu5c-8-16"
+     """8 vCPU, 16GB RAM, max 120GB container disk"""
+
+
+ def calculate_max_disk_size(instance_type: CpuInstanceType) -> int:
+     """
+     Calculate the maximum container disk size for a CPU instance type.
+
+     Formula:
+     - CPU3G/CPU3C: vCPU count × 10GB
+     - CPU5C: vCPU count × 15GB
+
+     Args:
+         instance_type: CPU instance type enum
+
+     Returns:
+         Maximum container disk size in GB
+
+     Example:
+         >>> calculate_max_disk_size(CpuInstanceType.CPU3G_1_4)
+         10
+         >>> calculate_max_disk_size(CpuInstanceType.CPU5C_2_4)
+         30
+     """
+     # Parse the instance type string to extract vCPU count
+     # Format: "cpu{generation}{type}-{vcpu}-{memory}"
+     instance_str = instance_type.value
+     parts = instance_str.split("-")
+
+     if len(parts) != 3:
+         raise ValueError(f"Invalid instance type format: {instance_str}")
+
+     vcpu_count = int(parts[1])
+
+     # Determine disk multiplier based on generation
+     if instance_str.startswith("cpu5c"):
+         disk_multiplier = 15  # CPU5C: 15GB per vCPU
+     elif instance_str.startswith(("cpu3g", "cpu3c")):
+         disk_multiplier = 10  # CPU3G/CPU3C: 10GB per vCPU
+     else:
+         raise ValueError(f"Unknown CPU generation/type: {instance_str}")
+
+     return vcpu_count * disk_multiplier
+
+
+ # CPU Instance Type Disk Limits (calculated programmatically)
+ CPU_INSTANCE_DISK_LIMITS = {
+     instance_type: calculate_max_disk_size(instance_type)
+     for instance_type in CpuInstanceType
+ }
+
+
+ def get_max_disk_size_for_instances(
+     instance_types: Optional[List[CpuInstanceType]],
+ ) -> Optional[int]:
+     """
+     Calculate the maximum container disk size for a list of CPU instance types.
+
+     Returns the minimum disk limit across all instance types to ensure compatibility
+     with all specified instances.
+
+     Args:
+         instance_types: List of CPU instance types, or None
+
+     Returns:
+         Maximum allowed disk size in GB, or None if no CPU instances specified
+
+     Example:
+         >>> get_max_disk_size_for_instances([CpuInstanceType.CPU3G_1_4])
+         10
+         >>> get_max_disk_size_for_instances([CpuInstanceType.CPU3G_1_4, CpuInstanceType.CPU3G_2_8])
+         10
+     """
+     if not instance_types:
+         return None
+
+     disk_limits = [
+         CPU_INSTANCE_DISK_LIMITS[instance_type] for instance_type in instance_types
+     ]
+     return min(disk_limits)
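The min() semantics matter when an endpoint may be scheduled onto any of several instance types: the effective cap is the smallest per-instance limit. A short check against the new functions (imported from the module directly, since they are not re-exported):

    from tetra_rp.core.resources.cpu import (
        CpuInstanceType,
        get_max_disk_size_for_instances,
    )

    # cpu3c-2-4 caps at 2 × 10 = 20GB; cpu5c-8-16 at 8 × 15 = 120GB.
    # An endpoint that may land on either instance gets the smaller cap.
    mixed = [CpuInstanceType.CPU3C_2_4, CpuInstanceType.CPU5C_8_16]
    assert get_max_disk_size_for_instances(mixed) == 20
    assert get_max_disk_size_for_instances(None) is None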
tetra_rp/core/resources/gpu.py CHANGED
@@ -32,20 +32,35 @@ class GpuTypeDetail(GpuType):

  # TODO: this should be fetched from an API
  class GpuGroup(Enum):
-     ANY = "any"  # "Any GPU"
-     ADA_24 = "ADA_24"  # "NVIDIA GeForce RTX 4090"
-     ADA_32_PRO = "ADA_32_PRO"  # "NVIDIA GeForce RTX 5090"
-     ADA_48_PRO = (
-         "ADA_48_PRO"  # "NVIDIA RTX 6000 Ada Generation, NVIDIA L40, NVIDIA L40S"
-     )
-     ADA_80_PRO = (
-         "ADA_80_PRO"  # "NVIDIA H100 PCIe, NVIDIA H100 80GB HBM3, NVIDIA H100 NVL"
-     )
-     AMPERE_16 = "AMPERE_16"  # "NVIDIA RTX A4000, NVIDIA RTX A4500, NVIDIA RTX 4000 Ada Generation, NVIDIA RTX 2000 Ada Generation"
-     AMPERE_24 = "AMPERE_24"  # "NVIDIA RTX A5000, NVIDIA L4, NVIDIA GeForce RTX 3090"
-     AMPERE_48 = "AMPERE_48"  # "NVIDIA A40, NVIDIA RTX A6000"
-     AMPERE_80 = "AMPERE_80"  # "NVIDIA A100 80GB PCIe, NVIDIA A100-SXM4-80GB"
-     HOPPER_141 = "HOPPER_141"  # "NVIDIA H200"
+     ANY = "any"
+     """Any GPU"""
+
+     ADA_24 = "ADA_24"
+     """NVIDIA GeForce RTX 4090"""
+
+     ADA_32_PRO = "ADA_32_PRO"
+     """NVIDIA GeForce RTX 5090"""
+
+     ADA_48_PRO = "ADA_48_PRO"
+     """NVIDIA RTX 6000 Ada Generation, NVIDIA L40, NVIDIA L40S"""
+
+     ADA_80_PRO = "ADA_80_PRO"
+     """NVIDIA H100 PCIe, NVIDIA H100 80GB HBM3, NVIDIA H100 NVL"""
+
+     AMPERE_16 = "AMPERE_16"
+     """NVIDIA RTX A4000, NVIDIA RTX A4500, NVIDIA RTX 4000 Ada Generation, NVIDIA RTX 2000 Ada Generation"""
+
+     AMPERE_24 = "AMPERE_24"
+     """NVIDIA RTX A5000, NVIDIA L4, NVIDIA GeForce RTX 3090"""
+
+     AMPERE_48 = "AMPERE_48"
+     """NVIDIA A40, NVIDIA RTX A6000"""
+
+     AMPERE_80 = "AMPERE_80"
+     """NVIDIA A100 80GB PCIe, NVIDIA A100-SXM4-80GB"""
+
+     HOPPER_141 = "HOPPER_141"
+     """NVIDIA H200"""

      @classmethod
      def all(cls) -> List["GpuGroup"]:
tetra_rp/core/resources/live_serverless.py CHANGED
@@ -2,6 +2,7 @@
  import os
  from pydantic import model_validator
  from .serverless import ServerlessEndpoint
+ from .serverless_cpu import CpuServerlessEndpoint

  TETRA_IMAGE_TAG = os.environ.get("TETRA_IMAGE_TAG", "latest")
  TETRA_GPU_IMAGE = os.environ.get(
@@ -12,25 +13,50 @@ TETRA_CPU_IMAGE = os.environ.get(
  )


- class LiveServerless(ServerlessEndpoint):
-     @model_validator(mode="before")
-     @classmethod
-     def set_live_serverless_template(cls, data: dict):
-         """Set default templates for Live Serverless. This can't be changed."""
-         # Always set imageName based on instanceIds presence
-         data["imageName"] = (
-             TETRA_CPU_IMAGE if data.get("instanceIds") else TETRA_GPU_IMAGE
-         )
-         return data
+ class LiveServerlessMixin:
+     """Common mixin for live serverless endpoints that locks the image."""
+
+     @property
+     def _live_image(self) -> str:
+         """Override in subclasses to specify the locked image."""
+         raise NotImplementedError("Subclasses must define _live_image")

      @property
      def imageName(self):
-         # Lock imageName to always reflect instanceIds
-         return (
-             TETRA_CPU_IMAGE if getattr(self, "instanceIds", None) else TETRA_GPU_IMAGE
-         )
+         # Lock imageName to specific image
+         return self._live_image

      @imageName.setter
      def imageName(self, value):
          # Prevent manual setting of imageName
          pass
+
+
+ class LiveServerless(LiveServerlessMixin, ServerlessEndpoint):
+     """GPU-only live serverless endpoint."""
+
+     @property
+     def _live_image(self) -> str:
+         return TETRA_GPU_IMAGE
+
+     @model_validator(mode="before")
+     @classmethod
+     def set_live_serverless_template(cls, data: dict):
+         """Set default GPU image for Live Serverless."""
+         data["imageName"] = TETRA_GPU_IMAGE
+         return data
+
+
+ class CpuLiveServerless(LiveServerlessMixin, CpuServerlessEndpoint):
+     """CPU-only live serverless endpoint with automatic disk sizing."""
+
+     @property
+     def _live_image(self) -> str:
+         return TETRA_CPU_IMAGE
+
+     @model_validator(mode="before")
+     @classmethod
+     def set_live_serverless_template(cls, data: dict):
+         """Set default CPU image for Live Serverless."""
+         data["imageName"] = TETRA_CPU_IMAGE
+         return data
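The mixin locks the image by pairing a read-only property with a setter that ignores writes, so pydantic (or a caller) can assign imageName without ever changing what reads return. A standalone sketch of the idiom, stripped of pydantic, with hypothetical class and image names:

    class ImageLockMixin:
        @property
        def _live_image(self) -> str:
            raise NotImplementedError("Subclasses must define _live_image")

        @property
        def image_name(self) -> str:
            # Reads always resolve to the subclass-defined image.
            return self._live_image

        @image_name.setter
        def image_name(self, value) -> None:
            # Writes are silently ignored, so the image cannot be overridden.
            pass

    class GpuEndpoint(ImageLockMixin):
        @property
        def _live_image(self) -> str:
            return "example/gpu-image:latest"  # hypothetical image name

    e = GpuEndpoint()
    e.image_name = "something/else:1.0"  # accepted but ignored
    print(e.image_name)                  # example/gpu-image:latest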
tetra_rp/core/resources/resource_manager.py CHANGED
@@ -1,9 +1,11 @@
+ import asyncio
  import cloudpickle
  import logging
- from typing import Dict
+ from typing import Dict, Optional
  from pathlib import Path

  from ..utils.singleton import SingletonMixin
+ from ..utils.file_lock import file_lock, FileLockError

  from .base import DeployableResource

@@ -17,28 +19,46 @@ RESOURCE_STATE_FILE = Path(".tetra_resources.pkl")
  class ResourceManager(SingletonMixin):
      """Manages dynamic provisioning and tracking of remote resources."""

+     # Class variables shared across all instances (singleton)
      _resources: Dict[str, DeployableResource] = {}
+     _deployment_locks: Dict[str, asyncio.Lock] = {}
+     _global_lock: Optional[asyncio.Lock] = None  # Will be initialized lazily
+     _lock_initialized = False

      def __init__(self):
+         # Ensure async locks are initialized properly for the singleton instance
+         if not ResourceManager._lock_initialized:
+             ResourceManager._global_lock = asyncio.Lock()
+             ResourceManager._lock_initialized = True
+
          if not self._resources:
              self._load_resources()

      def _load_resources(self) -> Dict[str, DeployableResource]:
-         """Load persisted resource information using cloudpickle."""
+         """Load persisted resource information using cross-platform file locking."""
          if RESOURCE_STATE_FILE.exists():
              try:
                  with open(RESOURCE_STATE_FILE, "rb") as f:
-                     self._resources = cloudpickle.load(f)
-                 log.debug(f"Loaded saved resources from {RESOURCE_STATE_FILE}")
-             except Exception as e:
+                     # Acquire shared lock for reading (cross-platform)
+                     with file_lock(f, exclusive=False):
+                         self._resources = cloudpickle.load(f)
+                     log.debug(f"Loaded saved resources from {RESOURCE_STATE_FILE}")
+             except (FileLockError, Exception) as e:
                  log.error(f"Failed to load resources from {RESOURCE_STATE_FILE}: {e}")
          return self._resources

      def _save_resources(self) -> None:
-         """Persist state of resources to disk using cloudpickle."""
-         with open(RESOURCE_STATE_FILE, "wb") as f:
-             cloudpickle.dump(self._resources, f)
-         log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
+         """Persist state of resources to disk using cross-platform file locking."""
+         try:
+             with open(RESOURCE_STATE_FILE, "wb") as f:
+                 # Acquire exclusive lock for writing (cross-platform)
+                 with file_lock(f, exclusive=True):
+                     cloudpickle.dump(self._resources, f)
+                     f.flush()  # Ensure data is written to disk
+             log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
+         except (FileLockError, Exception) as e:
+             log.error(f"Failed to save resources to {RESOURCE_STATE_FILE}: {e}")
+             raise

      def add_resource(self, uid: str, resource: DeployableResource):
          """Add a resource to the manager."""
@@ -60,21 +80,42 @@ class ResourceManager(SingletonMixin):
      async def get_or_deploy_resource(
          self, config: DeployableResource
      ) -> DeployableResource:
-         """Get existing or create new resource based on config."""
-         uid = config.resource_id
-         if existing := self._resources.get(uid):
-             if not existing.is_deployed():
-                 log.warning(f"{existing} is no longer valid, redeploying.")
-                 self.remove_resource(uid)
-                 return await self.get_or_deploy_resource(config)
+         """Get existing or create new resource based on config.

-             log.debug(f"{existing} exists, reusing.")
-             log.info(f"URL: {existing.url}")
-             return existing
+         Thread-safe implementation that prevents concurrent deployments
+         of the same resource configuration.
+         """
+         uid = config.resource_id

-         if deployed_resource := await config.deploy():
+         # Ensure global lock is initialized (should be done in __init__)
+         assert ResourceManager._global_lock is not None, "Global lock not initialized"
+
+         # Get or create a per-resource lock
+         async with ResourceManager._global_lock:
+             if uid not in ResourceManager._deployment_locks:
+                 ResourceManager._deployment_locks[uid] = asyncio.Lock()
+             resource_lock = ResourceManager._deployment_locks[uid]
+
+         # Acquire per-resource lock for this specific configuration
+         async with resource_lock:
+             # Double-check pattern: check again inside the lock
+             if existing := self._resources.get(uid):
+                 if not existing.is_deployed():
+                     log.warning(f"{existing} is no longer valid, redeploying.")
+                     self.remove_resource(uid)
+                     # Don't recursive call - deploy directly within the lock
+                     deployed_resource = await config.deploy()
+                     log.info(f"URL: {deployed_resource.url}")
+                     self.add_resource(uid, deployed_resource)
+                     return deployed_resource
+
+                 log.debug(f"{existing} exists, reusing.")
+                 log.info(f"URL: {existing.url}")
+                 return existing
+
+             # No existing resource, deploy new one
+             log.debug(f"Deploying new resource: {uid}")
+             deployed_resource = await config.deploy()
              log.info(f"URL: {deployed_resource.url}")
              self.add_resource(uid, deployed_resource)
              return deployed_resource
-
-         raise RuntimeError(f"Deployment failed for resource {uid}")
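The locking scheme in this hunk generalizes: a short-lived global lock guards creation of per-key locks, a per-key lock serializes the expensive operation, and the cache is re-checked inside the lock. A minimal standalone sketch of the same pattern, with a stubbed deploy() in place of the real call (Python 3.10+, where asyncio.Lock no longer binds to a loop at creation):

    import asyncio

    _global_lock = asyncio.Lock()
    _locks: dict[str, asyncio.Lock] = {}
    _cache: dict[str, str] = {}

    async def deploy(uid: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for the real deployment
        return f"endpoint-for-{uid}"

    async def get_or_deploy(uid: str) -> str:
        # Global lock only guards per-key lock creation (cheap and brief).
        async with _global_lock:
            lock = _locks.setdefault(uid, asyncio.Lock())
        # Per-key lock serializes work for this uid without blocking others.
        async with lock:
            if uid in _cache:  # double-check inside the lock
                return _cache[uid]
            _cache[uid] = await deploy(uid)
            return _cache[uid]

    async def main():
        # Ten concurrent callers; deploy() runs exactly once for "res-1".
        results = await asyncio.gather(*(get_or_deploy("res-1") for _ in range(10)))
        assert len(set(results)) == 1

    asyncio.run(main())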