tetra-rp 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tetra-rp might be problematic. Click here for more details.
- tetra_rp/__init__.py +2 -0
- tetra_rp/core/resources/__init__.py +3 -2
- tetra_rp/core/resources/cpu.py +115 -12
- tetra_rp/core/resources/gpu.py +29 -14
- tetra_rp/core/resources/live_serverless.py +40 -14
- tetra_rp/core/resources/resource_manager.py +63 -22
- tetra_rp/core/resources/serverless.py +27 -46
- tetra_rp/core/resources/serverless_cpu.py +154 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/singleton.py +15 -1
- tetra_rp/stubs/live_serverless.py +10 -6
- tetra_rp/stubs/registry.py +27 -12
- {tetra_rp-0.11.0.dist-info → tetra_rp-0.12.0.dist-info}/METADATA +1 -1
- {tetra_rp-0.11.0.dist-info → tetra_rp-0.12.0.dist-info}/RECORD +16 -14
- {tetra_rp-0.11.0.dist-info → tetra_rp-0.12.0.dist-info}/WHEEL +0 -0
- {tetra_rp-0.11.0.dist-info → tetra_rp-0.12.0.dist-info}/top_level.txt +0 -0
tetra_rp/__init__.py
CHANGED
|
@@ -13,6 +13,7 @@ from .client import remote # noqa: E402
|
|
|
13
13
|
from .core.resources import ( # noqa: E402
|
|
14
14
|
CpuServerlessEndpoint,
|
|
15
15
|
CpuInstanceType,
|
|
16
|
+
CpuLiveServerless,
|
|
16
17
|
CudaVersion,
|
|
17
18
|
DataCenter,
|
|
18
19
|
GpuGroup,
|
|
@@ -29,6 +30,7 @@ __all__ = [
|
|
|
29
30
|
"remote",
|
|
30
31
|
"CpuServerlessEndpoint",
|
|
31
32
|
"CpuInstanceType",
|
|
33
|
+
"CpuLiveServerless",
|
|
32
34
|
"CudaVersion",
|
|
33
35
|
"DataCenter",
|
|
34
36
|
"GpuGroup",
|
|
@@ -3,14 +3,14 @@ from .cloud import runpod
|
|
|
3
3
|
from .cpu import CpuInstanceType
|
|
4
4
|
from .gpu import GpuGroup, GpuType, GpuTypeDetail
|
|
5
5
|
from .resource_manager import ResourceManager
|
|
6
|
-
from .live_serverless import LiveServerless
|
|
6
|
+
from .live_serverless import LiveServerless, CpuLiveServerless
|
|
7
7
|
from .serverless import (
|
|
8
|
-
CpuServerlessEndpoint,
|
|
9
8
|
ServerlessResource,
|
|
10
9
|
ServerlessEndpoint,
|
|
11
10
|
JobOutput,
|
|
12
11
|
CudaVersion,
|
|
13
12
|
)
|
|
13
|
+
from .serverless_cpu import CpuServerlessEndpoint
|
|
14
14
|
from .template import PodTemplate
|
|
15
15
|
from .network_volume import NetworkVolume, DataCenter
|
|
16
16
|
|
|
@@ -19,6 +19,7 @@ __all__ = [
|
|
|
19
19
|
"runpod",
|
|
20
20
|
"BaseResource",
|
|
21
21
|
"CpuInstanceType",
|
|
22
|
+
"CpuLiveServerless",
|
|
22
23
|
"CpuServerlessEndpoint",
|
|
23
24
|
"CudaVersion",
|
|
24
25
|
"DataCenter",
|
tetra_rp/core/resources/cpu.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
+
from typing import List, Optional
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
class CpuInstanceType(str, Enum):
|
|
@@ -16,19 +17,121 @@ class CpuInstanceType(str, Enum):
|
|
|
16
17
|
"""
|
|
17
18
|
|
|
18
19
|
# 3rd Generation General Purpose (RAM multiplier: 4.0)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
20
|
+
|
|
21
|
+
CPU3G_1_4 = "cpu3g-1-4"
|
|
22
|
+
"""1 vCPU, 4GB RAM, max 10GB container disk"""
|
|
23
|
+
|
|
24
|
+
CPU3G_2_8 = "cpu3g-2-8"
|
|
25
|
+
"""2 vCPU, 8GB RAM, max 20GB container disk"""
|
|
26
|
+
|
|
27
|
+
CPU3G_4_16 = "cpu3g-4-16"
|
|
28
|
+
"""4 vCPU, 16GB RAM, max 40GB container disk"""
|
|
29
|
+
|
|
30
|
+
CPU3G_8_32 = "cpu3g-8-32"
|
|
31
|
+
"""8 vCPU, 32GB RAM, max 80GB container disk"""
|
|
23
32
|
|
|
24
33
|
# 3rd Generation Compute-Optimized (RAM multiplier: 2.0)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
34
|
+
|
|
35
|
+
CPU3C_1_2 = "cpu3c-1-2"
|
|
36
|
+
"""1 vCPU, 2GB RAM, max 10GB container disk"""
|
|
37
|
+
|
|
38
|
+
CPU3C_2_4 = "cpu3c-2-4"
|
|
39
|
+
"""2 vCPU, 4GB RAM, max 20GB container disk"""
|
|
40
|
+
|
|
41
|
+
CPU3C_4_8 = "cpu3c-4-8"
|
|
42
|
+
"""4 vCPU, 8GB RAM, max 40GB container disk"""
|
|
43
|
+
|
|
44
|
+
CPU3C_8_16 = "cpu3c-8-16"
|
|
45
|
+
"""8 vCPU, 16GB RAM, max 80GB container disk"""
|
|
29
46
|
|
|
30
47
|
# 5th Generation Compute-Optimized (RAM multiplier: 2.0)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
48
|
+
|
|
49
|
+
CPU5C_1_2 = "cpu5c-1-2"
|
|
50
|
+
"""1 vCPU, 2GB RAM, max 15GB container disk"""
|
|
51
|
+
|
|
52
|
+
CPU5C_2_4 = "cpu5c-2-4"
|
|
53
|
+
"""2 vCPU, 4GB RAM, max 30GB container disk"""
|
|
54
|
+
|
|
55
|
+
CPU5C_4_8 = "cpu5c-4-8"
|
|
56
|
+
"""4 vCPU, 8GB RAM, max 60GB container disk"""
|
|
57
|
+
|
|
58
|
+
CPU5C_8_16 = "cpu5c-8-16"
|
|
59
|
+
"""8 vCPU, 16GB RAM, max 120GB container disk"""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def calculate_max_disk_size(instance_type: CpuInstanceType) -> int:
|
|
63
|
+
"""
|
|
64
|
+
Calculate the maximum container disk size for a CPU instance type.
|
|
65
|
+
|
|
66
|
+
Formula:
|
|
67
|
+
- CPU3G/CPU3C: vCPU count × 10GB
|
|
68
|
+
- CPU5C: vCPU count × 15GB
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
instance_type: CPU instance type enum
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
Maximum container disk size in GB
|
|
75
|
+
|
|
76
|
+
Example:
|
|
77
|
+
>>> calculate_max_disk_size(CpuInstanceType.CPU3G_1_4)
|
|
78
|
+
10
|
|
79
|
+
>>> calculate_max_disk_size(CpuInstanceType.CPU5C_2_4)
|
|
80
|
+
30
|
|
81
|
+
"""
|
|
82
|
+
# Parse the instance type string to extract vCPU count
|
|
83
|
+
# Format: "cpu{generation}{type}-{vcpu}-{memory}"
|
|
84
|
+
instance_str = instance_type.value
|
|
85
|
+
parts = instance_str.split("-")
|
|
86
|
+
|
|
87
|
+
if len(parts) != 3:
|
|
88
|
+
raise ValueError(f"Invalid instance type format: {instance_str}")
|
|
89
|
+
|
|
90
|
+
vcpu_count = int(parts[1])
|
|
91
|
+
|
|
92
|
+
# Determine disk multiplier based on generation
|
|
93
|
+
if instance_str.startswith("cpu5c"):
|
|
94
|
+
disk_multiplier = 15 # CPU5C: 15GB per vCPU
|
|
95
|
+
elif instance_str.startswith(("cpu3g", "cpu3c")):
|
|
96
|
+
disk_multiplier = 10 # CPU3G/CPU3C: 10GB per vCPU
|
|
97
|
+
else:
|
|
98
|
+
raise ValueError(f"Unknown CPU generation/type: {instance_str}")
|
|
99
|
+
|
|
100
|
+
return vcpu_count * disk_multiplier
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# CPU Instance Type Disk Limits (calculated programmatically)
|
|
104
|
+
CPU_INSTANCE_DISK_LIMITS = {
|
|
105
|
+
instance_type: calculate_max_disk_size(instance_type)
|
|
106
|
+
for instance_type in CpuInstanceType
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def get_max_disk_size_for_instances(
|
|
111
|
+
instance_types: Optional[List[CpuInstanceType]],
|
|
112
|
+
) -> Optional[int]:
|
|
113
|
+
"""
|
|
114
|
+
Calculate the maximum container disk size for a list of CPU instance types.
|
|
115
|
+
|
|
116
|
+
Returns the minimum disk limit across all instance types to ensure compatibility
|
|
117
|
+
with all specified instances.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
instance_types: List of CPU instance types, or None
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
Maximum allowed disk size in GB, or None if no CPU instances specified
|
|
124
|
+
|
|
125
|
+
Example:
|
|
126
|
+
>>> get_max_disk_size_for_instances([CpuInstanceType.CPU3G_1_4])
|
|
127
|
+
10
|
|
128
|
+
>>> get_max_disk_size_for_instances([CpuInstanceType.CPU3G_1_4, CpuInstanceType.CPU3G_2_8])
|
|
129
|
+
10
|
|
130
|
+
"""
|
|
131
|
+
if not instance_types:
|
|
132
|
+
return None
|
|
133
|
+
|
|
134
|
+
disk_limits = [
|
|
135
|
+
CPU_INSTANCE_DISK_LIMITS[instance_type] for instance_type in instance_types
|
|
136
|
+
]
|
|
137
|
+
return min(disk_limits)
|
tetra_rp/core/resources/gpu.py
CHANGED
|
@@ -32,20 +32,35 @@ class GpuTypeDetail(GpuType):
|
|
|
32
32
|
|
|
33
33
|
# TODO: this should be fetched from an API
|
|
34
34
|
class GpuGroup(Enum):
|
|
35
|
-
ANY = "any"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
35
|
+
ANY = "any"
|
|
36
|
+
"""Any GPU"""
|
|
37
|
+
|
|
38
|
+
ADA_24 = "ADA_24"
|
|
39
|
+
"""NVIDIA GeForce RTX 4090"""
|
|
40
|
+
|
|
41
|
+
ADA_32_PRO = "ADA_32_PRO"
|
|
42
|
+
"""NVIDIA GeForce RTX 5090"""
|
|
43
|
+
|
|
44
|
+
ADA_48_PRO = "ADA_48_PRO"
|
|
45
|
+
"""NVIDIA RTX 6000 Ada Generation, NVIDIA L40, NVIDIA L40S"""
|
|
46
|
+
|
|
47
|
+
ADA_80_PRO = "ADA_80_PRO"
|
|
48
|
+
"""NVIDIA H100 PCIe, NVIDIA H100 80GB HBM3, NVIDIA H100 NVL"""
|
|
49
|
+
|
|
50
|
+
AMPERE_16 = "AMPERE_16"
|
|
51
|
+
"""NVIDIA RTX A4000, NVIDIA RTX A4500, NVIDIA RTX 4000 Ada Generation, NVIDIA RTX 2000 Ada Generation"""
|
|
52
|
+
|
|
53
|
+
AMPERE_24 = "AMPERE_24"
|
|
54
|
+
"""NVIDIA RTX A5000, NVIDIA L4, NVIDIA GeForce RTX 3090"""
|
|
55
|
+
|
|
56
|
+
AMPERE_48 = "AMPERE_48"
|
|
57
|
+
"""NVIDIA A40, NVIDIA RTX A6000"""
|
|
58
|
+
|
|
59
|
+
AMPERE_80 = "AMPERE_80"
|
|
60
|
+
"""NVIDIA A100 80GB PCIe, NVIDIA A100-SXM4-80GB"""
|
|
61
|
+
|
|
62
|
+
HOPPER_141 = "HOPPER_141"
|
|
63
|
+
"""NVIDIA H200"""
|
|
49
64
|
|
|
50
65
|
@classmethod
|
|
51
66
|
def all(cls) -> List["GpuGroup"]:
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import os
|
|
3
3
|
from pydantic import model_validator
|
|
4
4
|
from .serverless import ServerlessEndpoint
|
|
5
|
+
from .serverless_cpu import CpuServerlessEndpoint
|
|
5
6
|
|
|
6
7
|
TETRA_IMAGE_TAG = os.environ.get("TETRA_IMAGE_TAG", "latest")
|
|
7
8
|
TETRA_GPU_IMAGE = os.environ.get(
|
|
@@ -12,25 +13,50 @@ TETRA_CPU_IMAGE = os.environ.get(
|
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
class
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
TETRA_CPU_IMAGE if data.get("instanceIds") else TETRA_GPU_IMAGE
|
|
23
|
-
)
|
|
24
|
-
return data
|
|
16
|
+
class LiveServerlessMixin:
|
|
17
|
+
"""Common mixin for live serverless endpoints that locks the image."""
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def _live_image(self) -> str:
|
|
21
|
+
"""Override in subclasses to specify the locked image."""
|
|
22
|
+
raise NotImplementedError("Subclasses must define _live_image")
|
|
25
23
|
|
|
26
24
|
@property
|
|
27
25
|
def imageName(self):
|
|
28
|
-
# Lock imageName to
|
|
29
|
-
return
|
|
30
|
-
TETRA_CPU_IMAGE if getattr(self, "instanceIds", None) else TETRA_GPU_IMAGE
|
|
31
|
-
)
|
|
26
|
+
# Lock imageName to specific image
|
|
27
|
+
return self._live_image
|
|
32
28
|
|
|
33
29
|
@imageName.setter
|
|
34
30
|
def imageName(self, value):
|
|
35
31
|
# Prevent manual setting of imageName
|
|
36
32
|
pass
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class LiveServerless(LiveServerlessMixin, ServerlessEndpoint):
|
|
36
|
+
"""GPU-only live serverless endpoint."""
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def _live_image(self) -> str:
|
|
40
|
+
return TETRA_GPU_IMAGE
|
|
41
|
+
|
|
42
|
+
@model_validator(mode="before")
|
|
43
|
+
@classmethod
|
|
44
|
+
def set_live_serverless_template(cls, data: dict):
|
|
45
|
+
"""Set default GPU image for Live Serverless."""
|
|
46
|
+
data["imageName"] = TETRA_GPU_IMAGE
|
|
47
|
+
return data
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class CpuLiveServerless(LiveServerlessMixin, CpuServerlessEndpoint):
|
|
51
|
+
"""CPU-only live serverless endpoint with automatic disk sizing."""
|
|
52
|
+
|
|
53
|
+
@property
|
|
54
|
+
def _live_image(self) -> str:
|
|
55
|
+
return TETRA_CPU_IMAGE
|
|
56
|
+
|
|
57
|
+
@model_validator(mode="before")
|
|
58
|
+
@classmethod
|
|
59
|
+
def set_live_serverless_template(cls, data: dict):
|
|
60
|
+
"""Set default CPU image for Live Serverless."""
|
|
61
|
+
data["imageName"] = TETRA_CPU_IMAGE
|
|
62
|
+
return data
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import cloudpickle
|
|
2
3
|
import logging
|
|
3
|
-
from typing import Dict
|
|
4
|
+
from typing import Dict, Optional
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from ..utils.singleton import SingletonMixin
|
|
8
|
+
from ..utils.file_lock import file_lock, FileLockError
|
|
7
9
|
|
|
8
10
|
from .base import DeployableResource
|
|
9
11
|
|
|
@@ -17,28 +19,46 @@ RESOURCE_STATE_FILE = Path(".tetra_resources.pkl")
|
|
|
17
19
|
class ResourceManager(SingletonMixin):
|
|
18
20
|
"""Manages dynamic provisioning and tracking of remote resources."""
|
|
19
21
|
|
|
22
|
+
# Class variables shared across all instances (singleton)
|
|
20
23
|
_resources: Dict[str, DeployableResource] = {}
|
|
24
|
+
_deployment_locks: Dict[str, asyncio.Lock] = {}
|
|
25
|
+
_global_lock: Optional[asyncio.Lock] = None # Will be initialized lazily
|
|
26
|
+
_lock_initialized = False
|
|
21
27
|
|
|
22
28
|
def __init__(self):
|
|
29
|
+
# Ensure async locks are initialized properly for the singleton instance
|
|
30
|
+
if not ResourceManager._lock_initialized:
|
|
31
|
+
ResourceManager._global_lock = asyncio.Lock()
|
|
32
|
+
ResourceManager._lock_initialized = True
|
|
33
|
+
|
|
23
34
|
if not self._resources:
|
|
24
35
|
self._load_resources()
|
|
25
36
|
|
|
26
37
|
def _load_resources(self) -> Dict[str, DeployableResource]:
|
|
27
|
-
"""Load persisted resource information using
|
|
38
|
+
"""Load persisted resource information using cross-platform file locking."""
|
|
28
39
|
if RESOURCE_STATE_FILE.exists():
|
|
29
40
|
try:
|
|
30
41
|
with open(RESOURCE_STATE_FILE, "rb") as f:
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
42
|
+
# Acquire shared lock for reading (cross-platform)
|
|
43
|
+
with file_lock(f, exclusive=False):
|
|
44
|
+
self._resources = cloudpickle.load(f)
|
|
45
|
+
log.debug(f"Loaded saved resources from {RESOURCE_STATE_FILE}")
|
|
46
|
+
except (FileLockError, Exception) as e:
|
|
34
47
|
log.error(f"Failed to load resources from {RESOURCE_STATE_FILE}: {e}")
|
|
35
48
|
return self._resources
|
|
36
49
|
|
|
37
50
|
def _save_resources(self) -> None:
|
|
38
|
-
"""Persist state of resources to disk using
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
51
|
+
"""Persist state of resources to disk using cross-platform file locking."""
|
|
52
|
+
try:
|
|
53
|
+
with open(RESOURCE_STATE_FILE, "wb") as f:
|
|
54
|
+
# Acquire exclusive lock for writing (cross-platform)
|
|
55
|
+
with file_lock(f, exclusive=True):
|
|
56
|
+
cloudpickle.dump(self._resources, f)
|
|
57
|
+
f.flush() # Ensure data is written to disk
|
|
58
|
+
log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
|
|
59
|
+
except (FileLockError, Exception) as e:
|
|
60
|
+
log.error(f"Failed to save resources to {RESOURCE_STATE_FILE}: {e}")
|
|
61
|
+
raise
|
|
42
62
|
|
|
43
63
|
def add_resource(self, uid: str, resource: DeployableResource):
|
|
44
64
|
"""Add a resource to the manager."""
|
|
@@ -60,21 +80,42 @@ class ResourceManager(SingletonMixin):
|
|
|
60
80
|
async def get_or_deploy_resource(
|
|
61
81
|
self, config: DeployableResource
|
|
62
82
|
) -> DeployableResource:
|
|
63
|
-
"""Get existing or create new resource based on config.
|
|
64
|
-
uid = config.resource_id
|
|
65
|
-
if existing := self._resources.get(uid):
|
|
66
|
-
if not existing.is_deployed():
|
|
67
|
-
log.warning(f"{existing} is no longer valid, redeploying.")
|
|
68
|
-
self.remove_resource(uid)
|
|
69
|
-
return await self.get_or_deploy_resource(config)
|
|
83
|
+
"""Get existing or create new resource based on config.
|
|
70
84
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
85
|
+
Thread-safe implementation that prevents concurrent deployments
|
|
86
|
+
of the same resource configuration.
|
|
87
|
+
"""
|
|
88
|
+
uid = config.resource_id
|
|
74
89
|
|
|
75
|
-
|
|
90
|
+
# Ensure global lock is initialized (should be done in __init__)
|
|
91
|
+
assert ResourceManager._global_lock is not None, "Global lock not initialized"
|
|
92
|
+
|
|
93
|
+
# Get or create a per-resource lock
|
|
94
|
+
async with ResourceManager._global_lock:
|
|
95
|
+
if uid not in ResourceManager._deployment_locks:
|
|
96
|
+
ResourceManager._deployment_locks[uid] = asyncio.Lock()
|
|
97
|
+
resource_lock = ResourceManager._deployment_locks[uid]
|
|
98
|
+
|
|
99
|
+
# Acquire per-resource lock for this specific configuration
|
|
100
|
+
async with resource_lock:
|
|
101
|
+
# Double-check pattern: check again inside the lock
|
|
102
|
+
if existing := self._resources.get(uid):
|
|
103
|
+
if not existing.is_deployed():
|
|
104
|
+
log.warning(f"{existing} is no longer valid, redeploying.")
|
|
105
|
+
self.remove_resource(uid)
|
|
106
|
+
# Don't recursive call - deploy directly within the lock
|
|
107
|
+
deployed_resource = await config.deploy()
|
|
108
|
+
log.info(f"URL: {deployed_resource.url}")
|
|
109
|
+
self.add_resource(uid, deployed_resource)
|
|
110
|
+
return deployed_resource
|
|
111
|
+
|
|
112
|
+
log.debug(f"{existing} exists, reusing.")
|
|
113
|
+
log.info(f"URL: {existing.url}")
|
|
114
|
+
return existing
|
|
115
|
+
|
|
116
|
+
# No existing resource, deploy new one
|
|
117
|
+
log.debug(f"Deploying new resource: {uid}")
|
|
118
|
+
deployed_resource = await config.deploy()
|
|
76
119
|
log.info(f"URL: {deployed_resource.url}")
|
|
77
120
|
self.add_resource(uid, deployed_resource)
|
|
78
121
|
return deployed_resource
|
|
79
|
-
|
|
80
|
-
raise RuntimeError(f"Deployment failed for resource {uid}")
|
|
@@ -17,7 +17,6 @@ from ..utils.backoff import get_backoff_delay
|
|
|
17
17
|
from .base import DeployableResource
|
|
18
18
|
from .cloud import runpod
|
|
19
19
|
from .constants import CONSOLE_URL
|
|
20
|
-
from .cpu import CpuInstanceType
|
|
21
20
|
from .environment import EnvironmentVars
|
|
22
21
|
from .gpu import GpuGroup
|
|
23
22
|
from .network_volume import NetworkVolume, DataCenter
|
|
@@ -86,7 +85,6 @@ class ServerlessResource(DeployableResource):
|
|
|
86
85
|
executionTimeoutMs: Optional[int] = None
|
|
87
86
|
gpuCount: Optional[int] = 1
|
|
88
87
|
idleTimeout: Optional[int] = 5
|
|
89
|
-
instanceIds: Optional[List[CpuInstanceType]] = None
|
|
90
88
|
locations: Optional[str] = None
|
|
91
89
|
name: str
|
|
92
90
|
networkVolumeId: Optional[str] = None
|
|
@@ -134,15 +132,6 @@ class ServerlessResource(DeployableResource):
|
|
|
134
132
|
"""Convert ServerlessScalerType enum to string."""
|
|
135
133
|
return value.value if value is not None else None
|
|
136
134
|
|
|
137
|
-
@field_serializer("instanceIds")
|
|
138
|
-
def serialize_instance_ids(
|
|
139
|
-
self, value: Optional[List[CpuInstanceType]]
|
|
140
|
-
) -> Optional[List[str]]:
|
|
141
|
-
"""Convert CpuInstanceType enums to strings."""
|
|
142
|
-
if value is None:
|
|
143
|
-
return None
|
|
144
|
-
return [item.value if hasattr(item, "value") else str(item) for item in value]
|
|
145
|
-
|
|
146
135
|
@field_validator("gpus")
|
|
147
136
|
@classmethod
|
|
148
137
|
def validate_gpus(cls, value: List[GpuGroup]) -> List[GpuGroup]:
|
|
@@ -172,10 +161,9 @@ class ServerlessResource(DeployableResource):
|
|
|
172
161
|
# Volume already exists, use its ID
|
|
173
162
|
self.networkVolumeId = self.networkVolume.id
|
|
174
163
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
return self._sync_input_fields_gpu()
|
|
164
|
+
self._sync_input_fields_gpu()
|
|
165
|
+
|
|
166
|
+
return self
|
|
179
167
|
|
|
180
168
|
def _sync_input_fields_gpu(self):
|
|
181
169
|
# GPU-specific fields
|
|
@@ -199,14 +187,6 @@ class ServerlessResource(DeployableResource):
|
|
|
199
187
|
|
|
200
188
|
return self
|
|
201
189
|
|
|
202
|
-
def _sync_input_fields_cpu(self):
|
|
203
|
-
# Override GPU-specific fields for CPU
|
|
204
|
-
self.gpuCount = 0
|
|
205
|
-
self.allowedCudaVersions = ""
|
|
206
|
-
self.gpuIds = ""
|
|
207
|
-
|
|
208
|
-
return self
|
|
209
|
-
|
|
210
190
|
async def _ensure_network_volume_deployed(self) -> None:
|
|
211
191
|
"""
|
|
212
192
|
Ensures network volume is deployed and ready if one is specified.
|
|
@@ -274,7 +254,7 @@ class ServerlessResource(DeployableResource):
|
|
|
274
254
|
)
|
|
275
255
|
|
|
276
256
|
try:
|
|
277
|
-
# log.debug(f"[{
|
|
257
|
+
# log.debug(f"[{self}] Payload: {payload}")
|
|
278
258
|
|
|
279
259
|
log.info(f"{self} | API /run_sync")
|
|
280
260
|
response = await asyncio.to_thread(_fetch_job)
|
|
@@ -355,6 +335,26 @@ class ServerlessEndpoint(ServerlessResource):
|
|
|
355
335
|
Inherits from ServerlessResource.
|
|
356
336
|
"""
|
|
357
337
|
|
|
338
|
+
def _create_new_template(self) -> PodTemplate:
|
|
339
|
+
"""Create a new PodTemplate with standard configuration."""
|
|
340
|
+
return PodTemplate(
|
|
341
|
+
name=self.resource_id,
|
|
342
|
+
imageName=self.imageName,
|
|
343
|
+
env=KeyValuePair.from_dict(self.env or get_env_vars()),
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
def _configure_existing_template(self) -> None:
|
|
347
|
+
"""Configure an existing template with necessary overrides."""
|
|
348
|
+
if self.template is None:
|
|
349
|
+
return
|
|
350
|
+
|
|
351
|
+
self.template.name = f"{self.resource_id}__{self.template.resource_id}"
|
|
352
|
+
|
|
353
|
+
if self.imageName:
|
|
354
|
+
self.template.imageName = self.imageName
|
|
355
|
+
if self.env:
|
|
356
|
+
self.template.env = KeyValuePair.from_dict(self.env)
|
|
357
|
+
|
|
358
358
|
@model_validator(mode="after")
|
|
359
359
|
def set_serverless_template(self):
|
|
360
360
|
if not any([self.imageName, self.template, self.templateId]):
|
|
@@ -363,32 +363,13 @@ class ServerlessEndpoint(ServerlessResource):
|
|
|
363
363
|
)
|
|
364
364
|
|
|
365
365
|
if not self.templateId and not self.template:
|
|
366
|
-
self.template =
|
|
367
|
-
name=self.resource_id,
|
|
368
|
-
imageName=self.imageName,
|
|
369
|
-
env=KeyValuePair.from_dict(self.env or get_env_vars()),
|
|
370
|
-
)
|
|
371
|
-
|
|
366
|
+
self.template = self._create_new_template()
|
|
372
367
|
elif self.template:
|
|
373
|
-
self.
|
|
374
|
-
if self.imageName:
|
|
375
|
-
self.template.imageName = self.imageName
|
|
376
|
-
if self.env:
|
|
377
|
-
self.template.env = KeyValuePair.from_dict(self.env)
|
|
368
|
+
self._configure_existing_template()
|
|
378
369
|
|
|
379
370
|
return self
|
|
380
371
|
|
|
381
372
|
|
|
382
|
-
class CpuServerlessEndpoint(ServerlessEndpoint):
|
|
383
|
-
"""
|
|
384
|
-
Convenience class for CPU serverless endpoint.
|
|
385
|
-
Represents a CPU-only serverless endpoint distinct from a live serverless.
|
|
386
|
-
Inherits from ServerlessEndpoint.
|
|
387
|
-
"""
|
|
388
|
-
|
|
389
|
-
instanceIds: Optional[List[CpuInstanceType]] = [CpuInstanceType.CPU3G_2_8]
|
|
390
|
-
|
|
391
|
-
|
|
392
373
|
class JobOutput(BaseModel):
|
|
393
374
|
id: str
|
|
394
375
|
workerId: str
|
|
@@ -398,7 +379,7 @@ class JobOutput(BaseModel):
|
|
|
398
379
|
output: Optional[Any] = None
|
|
399
380
|
error: Optional[str] = ""
|
|
400
381
|
|
|
401
|
-
def model_post_init(self,
|
|
382
|
+
def model_post_init(self, _: Any) -> None:
|
|
402
383
|
log_group = f"Worker:{self.workerId}"
|
|
403
384
|
log.info(f"{log_group} | Delay Time: {self.delayTime} ms")
|
|
404
385
|
log.info(f"{log_group} | Execution Time: {self.executionTime} ms")
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CPU-specific serverless endpoint classes.
|
|
3
|
+
|
|
4
|
+
This module contains all CPU-related serverless functionality, separate from GPU serverless.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
from pydantic import field_serializer, model_validator
|
|
10
|
+
|
|
11
|
+
from .cpu import (
|
|
12
|
+
CpuInstanceType,
|
|
13
|
+
CPU_INSTANCE_DISK_LIMITS,
|
|
14
|
+
get_max_disk_size_for_instances,
|
|
15
|
+
)
|
|
16
|
+
from .serverless import ServerlessEndpoint, get_env_vars
|
|
17
|
+
from .template import KeyValuePair, PodTemplate
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CpuEndpointMixin:
|
|
21
|
+
"""Mixin class that provides CPU-specific functionality for serverless endpoints."""
|
|
22
|
+
|
|
23
|
+
instanceIds: Optional[List[CpuInstanceType]]
|
|
24
|
+
|
|
25
|
+
def _is_cpu_endpoint(self) -> bool:
|
|
26
|
+
"""Check if this is a CPU endpoint (has instanceIds)."""
|
|
27
|
+
return (
|
|
28
|
+
hasattr(self, "instanceIds")
|
|
29
|
+
and self.instanceIds is not None
|
|
30
|
+
and len(self.instanceIds) > 0
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
def _get_cpu_container_disk_size(self) -> Optional[int]:
|
|
34
|
+
"""Get the appropriate container disk size for CPU instances."""
|
|
35
|
+
if not self._is_cpu_endpoint():
|
|
36
|
+
return None
|
|
37
|
+
return get_max_disk_size_for_instances(self.instanceIds)
|
|
38
|
+
|
|
39
|
+
def _apply_cpu_disk_sizing(self, template: PodTemplate) -> None:
|
|
40
|
+
"""Apply CPU disk sizing to a template if it's using the default size."""
|
|
41
|
+
if not self._is_cpu_endpoint():
|
|
42
|
+
return
|
|
43
|
+
|
|
44
|
+
# Only auto-size if template is using the default value
|
|
45
|
+
default_disk_size = PodTemplate.model_fields["containerDiskInGb"].default
|
|
46
|
+
if template.containerDiskInGb == default_disk_size:
|
|
47
|
+
cpu_disk_size = self._get_cpu_container_disk_size()
|
|
48
|
+
if cpu_disk_size is not None:
|
|
49
|
+
template.containerDiskInGb = cpu_disk_size
|
|
50
|
+
|
|
51
|
+
def validate_cpu_container_disk_size(self) -> None:
|
|
52
|
+
"""
|
|
53
|
+
Validate that container disk size doesn't exceed limits for CPU instances.
|
|
54
|
+
|
|
55
|
+
Raises:
|
|
56
|
+
ValueError: If container disk size exceeds the limit for any CPU instance
|
|
57
|
+
"""
|
|
58
|
+
if (
|
|
59
|
+
not self._is_cpu_endpoint()
|
|
60
|
+
or not hasattr(self, "template")
|
|
61
|
+
or not self.template
|
|
62
|
+
or not self.template.containerDiskInGb
|
|
63
|
+
):
|
|
64
|
+
return
|
|
65
|
+
|
|
66
|
+
max_allowed_disk_size = self._get_cpu_container_disk_size()
|
|
67
|
+
if max_allowed_disk_size is None:
|
|
68
|
+
return
|
|
69
|
+
|
|
70
|
+
if self.template.containerDiskInGb > max_allowed_disk_size:
|
|
71
|
+
instance_limits = []
|
|
72
|
+
for instance_type in self.instanceIds:
|
|
73
|
+
limit = CPU_INSTANCE_DISK_LIMITS[instance_type]
|
|
74
|
+
instance_limits.append(f"{instance_type.value}: max {limit}GB")
|
|
75
|
+
|
|
76
|
+
raise ValueError(
|
|
77
|
+
f"Container disk size {self.template.containerDiskInGb}GB exceeds the maximum "
|
|
78
|
+
f"allowed for CPU instances. Instance limits: {', '.join(instance_limits)}. "
|
|
79
|
+
f"Maximum allowed: {max_allowed_disk_size}GB"
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
def _sync_cpu_fields(self):
|
|
83
|
+
"""Sync CPU-specific fields, overriding GPU defaults."""
|
|
84
|
+
# Override GPU-specific fields for CPU
|
|
85
|
+
if hasattr(self, "gpuCount"):
|
|
86
|
+
self.gpuCount = 0
|
|
87
|
+
if hasattr(self, "allowedCudaVersions"):
|
|
88
|
+
self.allowedCudaVersions = ""
|
|
89
|
+
if hasattr(self, "gpuIds"):
|
|
90
|
+
self.gpuIds = ""
|
|
91
|
+
|
|
92
|
+
@field_serializer("instanceIds")
|
|
93
|
+
def serialize_instance_ids(
|
|
94
|
+
self, value: Optional[List[CpuInstanceType]]
|
|
95
|
+
) -> Optional[List[str]]:
|
|
96
|
+
"""Convert CpuInstanceType enums to strings."""
|
|
97
|
+
if value is None:
|
|
98
|
+
return None
|
|
99
|
+
return [item.value if hasattr(item, "value") else str(item) for item in value]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class CpuServerlessEndpoint(CpuEndpointMixin, ServerlessEndpoint):
|
|
103
|
+
"""
|
|
104
|
+
CPU-only serverless endpoint with automatic disk sizing and validation.
|
|
105
|
+
Represents a CPU-only serverless endpoint distinct from a live serverless.
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
instanceIds: Optional[List[CpuInstanceType]] = [CpuInstanceType.CPU3G_2_8]
|
|
109
|
+
|
|
110
|
+
def _create_new_template(self) -> PodTemplate:
|
|
111
|
+
"""Create a new PodTemplate with CPU-appropriate disk sizing."""
|
|
112
|
+
template = PodTemplate(
|
|
113
|
+
name=self.resource_id,
|
|
114
|
+
imageName=self.imageName,
|
|
115
|
+
env=KeyValuePair.from_dict(self.env or get_env_vars()),
|
|
116
|
+
)
|
|
117
|
+
# Apply CPU-specific disk sizing
|
|
118
|
+
self._apply_cpu_disk_sizing(template)
|
|
119
|
+
return template
|
|
120
|
+
|
|
121
|
+
def _configure_existing_template(self) -> None:
|
|
122
|
+
"""Configure an existing template with necessary overrides and CPU sizing."""
|
|
123
|
+
if self.template is None:
|
|
124
|
+
return
|
|
125
|
+
|
|
126
|
+
self.template.name = f"{self.resource_id}__{self.template.resource_id}"
|
|
127
|
+
|
|
128
|
+
if self.imageName:
|
|
129
|
+
self.template.imageName = self.imageName
|
|
130
|
+
if self.env:
|
|
131
|
+
self.template.env = KeyValuePair.from_dict(self.env)
|
|
132
|
+
|
|
133
|
+
# Apply CPU-specific disk sizing
|
|
134
|
+
self._apply_cpu_disk_sizing(self.template)
|
|
135
|
+
|
|
136
|
+
@model_validator(mode="after")
|
|
137
|
+
def set_serverless_template(self):
|
|
138
|
+
# Sync CPU-specific fields first
|
|
139
|
+
self._sync_cpu_fields()
|
|
140
|
+
|
|
141
|
+
if not any([self.imageName, self.template, self.templateId]):
|
|
142
|
+
raise ValueError(
|
|
143
|
+
"Either imageName, template, or templateId must be provided"
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
if not self.templateId and not self.template:
|
|
147
|
+
self.template = self._create_new_template()
|
|
148
|
+
elif self.template:
|
|
149
|
+
self._configure_existing_template()
|
|
150
|
+
|
|
151
|
+
# Validate container disk size for CPU instances
|
|
152
|
+
self.validate_cpu_container_disk_size()
|
|
153
|
+
|
|
154
|
+
return self
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cross-platform file locking utilities.
|
|
3
|
+
|
|
4
|
+
Provides unified file locking interface that works across Windows, macOS, and Linux.
|
|
5
|
+
Uses platform-appropriate locking mechanisms:
|
|
6
|
+
- Windows: msvcrt.locking()
|
|
7
|
+
- Unix/Linux/macOS: fcntl.flock()
|
|
8
|
+
- Fallback: Basic file existence checking (limited protection)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import contextlib
|
|
12
|
+
import logging
|
|
13
|
+
import platform
|
|
14
|
+
import time
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import BinaryIO, Optional
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Platform detection
|
|
21
|
+
_IS_WINDOWS = platform.system() == "Windows"
|
|
22
|
+
_IS_UNIX = platform.system() in ("Linux", "Darwin")
|
|
23
|
+
|
|
24
|
+
# Initialize availability flags
|
|
25
|
+
_WINDOWS_LOCKING_AVAILABLE = False
|
|
26
|
+
_UNIX_LOCKING_AVAILABLE = False
|
|
27
|
+
|
|
28
|
+
# Import platform-specific modules
|
|
29
|
+
if _IS_WINDOWS:
|
|
30
|
+
try:
|
|
31
|
+
import msvcrt
|
|
32
|
+
|
|
33
|
+
_WINDOWS_LOCKING_AVAILABLE = True
|
|
34
|
+
except ImportError:
|
|
35
|
+
msvcrt = None
|
|
36
|
+
log.warning("msvcrt not available on Windows platform")
|
|
37
|
+
|
|
38
|
+
if _IS_UNIX:
|
|
39
|
+
try:
|
|
40
|
+
import fcntl
|
|
41
|
+
|
|
42
|
+
_UNIX_LOCKING_AVAILABLE = True
|
|
43
|
+
except ImportError:
|
|
44
|
+
fcntl = None
|
|
45
|
+
log.warning("fcntl not available on Unix platform")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FileLockError(Exception):
|
|
49
|
+
"""Exception raised when file locking operations fail."""
|
|
50
|
+
|
|
51
|
+
pass
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FileLockTimeout(FileLockError):
|
|
55
|
+
"""Exception raised when file locking times out."""
|
|
56
|
+
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@contextlib.contextmanager
def file_lock(
    file_handle: BinaryIO,
    exclusive: bool = True,
    timeout: Optional[float] = 10.0,
    retry_interval: float = 0.1,
):
    """Cross-platform advisory file lock as a context manager.

    Dispatches to the best available backend: msvcrt on Windows, flock on
    Unix-like systems, and a lock-file fallback everywhere else. Acquisition
    is retried every *retry_interval* seconds until *timeout* elapses.

    Args:
        file_handle: Open file handle to lock.
        exclusive: True for an exclusive lock, False for a shared lock
            (backends without shared locks promote to exclusive).
        timeout: Maximum seconds to wait for the lock (None = wait forever).
        retry_interval: Seconds between acquisition attempts.

    Raises:
        FileLockTimeout: If the lock cannot be acquired within *timeout*.
        FileLockError: If a locking operation itself fails.

    Usage:
        with open("file.dat", "rb") as f:
            with file_lock(f, exclusive=False):  # shared read lock
                data = f.read()
    """
    held = False
    started = time.time()

    try:
        # Keep retrying the platform-appropriate acquire until it sticks
        # or the deadline passes.
        while not held:
            try:
                if _IS_WINDOWS and _WINDOWS_LOCKING_AVAILABLE:
                    _acquire_windows_lock(file_handle, exclusive)
                elif _IS_UNIX and _UNIX_LOCKING_AVAILABLE:
                    _acquire_unix_lock(file_handle, exclusive)
                else:
                    # Last resort: lock-file scheme with limited guarantees.
                    _acquire_fallback_lock(file_handle, exclusive, timeout)

                held = True
                log.debug(f"File lock acquired (exclusive={exclusive})")

            except (OSError, IOError, FileLockError) as e:
                elapsed = time.time() - started
                if timeout is not None and elapsed >= timeout:
                    raise FileLockTimeout(
                        f"Could not acquire file lock within {timeout} seconds: {e}"
                    ) from e
                time.sleep(retry_interval)

        # Body of the caller's `with` runs while the lock is held.
        yield

    finally:
        if held:
            try:
                if _IS_WINDOWS and _WINDOWS_LOCKING_AVAILABLE:
                    _release_windows_lock(file_handle)
                elif _IS_UNIX and _UNIX_LOCKING_AVAILABLE:
                    _release_unix_lock(file_handle)
                else:
                    _release_fallback_lock(file_handle)

                log.debug("File lock released")

            except Exception as e:
                # Swallow release errors: raising during cleanup would mask
                # whatever the caller's body raised.
                log.error(f"Error releasing file lock: {e}")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _acquire_windows_lock(file_handle: BinaryIO, exclusive: bool) -> None:
    """Acquire a Windows file lock using msvcrt.locking().

    Locks one byte at offset 0, which serializes access to the whole file
    for cooperating users of this module.

    Args:
        file_handle: Open file handle to lock.
        exclusive: Requested lock type. msvcrt has no shared locks, so a
            shared request is silently promoted to an exclusive lock.

    Raises:
        FileLockError: If msvcrt is unavailable or the lock cannot be taken.
    """
    if not _WINDOWS_LOCKING_AVAILABLE:
        raise FileLockError("Windows file locking not available (msvcrt missing)")

    # msvcrt only supports exclusive locks; shared requests degrade.
    lock_mode = msvcrt.LK_NBLCK  # non-blocking exclusive lock
    if not exclusive:
        log.debug("Windows: Using exclusive lock instead of shared (msvcrt limitation)")

    # msvcrt.locking() operates at the current file position, so we must seek
    # to offset 0 — but restore the caller's position afterwards (previously
    # the handle was left rewound to 0 as a hidden side effect of locking).
    original_pos = file_handle.tell()
    try:
        file_handle.seek(0)
        msvcrt.locking(file_handle.fileno(), lock_mode, 1)
    except OSError as e:
        raise FileLockError(f"Failed to acquire Windows file lock: {e}") from e
    finally:
        file_handle.seek(original_pos)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _release_windows_lock(file_handle: BinaryIO) -> None:
    """Release a Windows file lock taken by _acquire_windows_lock().

    No-op when msvcrt is unavailable. Restores the caller's file position,
    since msvcrt.locking() must be issued at offset 0 (previously the handle
    was left rewound to 0 as a hidden side effect of unlocking).

    Raises:
        FileLockError: If the unlock call fails.
    """
    if not _WINDOWS_LOCKING_AVAILABLE:
        return

    original_pos = file_handle.tell()
    try:
        file_handle.seek(0)
        msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
    except OSError as e:
        raise FileLockError(f"Failed to release Windows file lock: {e}") from e
    finally:
        file_handle.seek(original_pos)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _acquire_unix_lock(file_handle: BinaryIO, exclusive: bool) -> None:
    """Acquire a Unix advisory lock on *file_handle* via fcntl.flock().

    Non-blocking: raises immediately when the lock is already held elsewhere;
    the caller (file_lock) is responsible for retrying.

    Raises:
        FileLockError: If fcntl is unavailable or the lock cannot be taken.
    """
    if not _UNIX_LOCKING_AVAILABLE:
        raise FileLockError("Unix file locking not available (fcntl missing)")

    # Shared vs exclusive, always combined with LOCK_NB so we never block.
    base_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH

    try:
        fcntl.flock(file_handle.fileno(), base_mode | fcntl.LOCK_NB)
    except (OSError, IOError) as e:
        raise FileLockError(f"Failed to acquire Unix file lock: {e}") from e
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _release_unix_lock(file_handle: BinaryIO) -> None:
    """Drop a Unix advisory lock previously taken with fcntl.flock().

    No-op when fcntl is unavailable.

    Raises:
        FileLockError: If the unlock call fails.
    """
    if not _UNIX_LOCKING_AVAILABLE:
        return

    try:
        fcntl.flock(file_handle.fileno(), fcntl.LOCK_UN)
    except (OSError, IOError) as e:
        raise FileLockError(f"Failed to release Unix file lock: {e}") from e
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _acquire_fallback_lock(
    file_handle: BinaryIO, exclusive: bool, timeout: Optional[float]
) -> None:
    """Best-effort locking via a sibling ``.lock`` file.

    Used when neither msvcrt nor fcntl is available. The lock file is created
    atomically (``touch(exist_ok=False)`` maps to O_CREAT | O_EXCL), so only
    one holder can create it at a time — but nothing protects against stale
    lock files left behind by a crashed process.

    Args:
        file_handle: Handle whose ``.name`` determines the lock-file path.
            NOTE(review): handles without a ``name`` attribute all share a
            single ``unknown.lock`` in the current directory — confirm this
            is acceptable for anonymous/in-memory handles.
        exclusive: Ignored — the lock-file scheme is always exclusive.
        timeout: Maximum seconds to wait for the lock file to disappear
            (None = wait forever).

    Raises:
        FileLockTimeout: If the lock file still exists after *timeout* seconds.
            (Subclasses FileLockError, so existing callers that catch
            FileLockError — including file_lock()'s retry loop — still work.)
    """
    log.warning(
        "Using fallback file locking - limited protection against race conditions"
    )

    # Derive the lock-file path from the original file.
    file_path = (
        Path(file_handle.name) if hasattr(file_handle, "name") else Path("unknown")
    )
    lock_file = file_path.with_suffix(file_path.suffix + ".lock")

    start_time = time.time()

    while True:
        try:
            # Atomic creation: fails with FileExistsError if another holder
            # already owns the lock file.
            lock_file.touch(mode=0o600, exist_ok=False)
            log.debug(f"Fallback lock file created: {lock_file}")
            return

        except FileExistsError:
            if timeout is not None and (time.time() - start_time) >= timeout:
                # Raise the dedicated timeout subclass rather than the
                # generic FileLockError the original used here.
                raise FileLockTimeout(f"Fallback lock timeout: {lock_file} exists")

            # Wait and retry.
            time.sleep(0.1)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _release_fallback_lock(file_handle: BinaryIO) -> None:
    """Remove the sibling ``.lock`` file created by _acquire_fallback_lock().

    Never raises: failures during cleanup are logged and suppressed.
    """
    try:
        # Re-derive the lock-file path the same way the acquire side did.
        if hasattr(file_handle, "name"):
            target = Path(file_handle.name)
        else:
            target = Path("unknown")
        lock_file = target.with_suffix(target.suffix + ".lock")

        if lock_file.exists():
            lock_file.unlink()
            log.debug(f"Fallback lock file removed: {lock_file}")

    except Exception as e:
        log.error(f"Failed to remove fallback lock file: {e}")
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_platform_info() -> dict:
    """Report the host platform and which locking backends are usable."""
    windows_ok = _IS_WINDOWS and _WINDOWS_LOCKING_AVAILABLE
    unix_ok = _IS_UNIX and _UNIX_LOCKING_AVAILABLE
    return {
        "platform": platform.system(),
        "windows_locking": windows_ok,
        "unix_locking": unix_ok,
        "fallback_only": not (_WINDOWS_LOCKING_AVAILABLE or _UNIX_LOCKING_AVAILABLE),
    }
|
tetra_rp/core/utils/singleton.py
CHANGED
|
@@ -1,7 +1,21 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
|
|
3
|
+
|
|
1
4
|
class SingletonMixin:
    """Mixin that makes each subclass a thread-safe singleton.

    The first construction of a class stores the instance in a shared
    registry keyed by the class; later constructions return the stored
    instance. A class-level lock with double-checked locking keeps creation
    race-free without paying the lock cost on the common (already-created)
    path.
    """

    _instances = {}
    _lock = threading.Lock()

    def __new__(cls, *args, **kwargs):
        # Fast path: no lock needed once the instance exists.
        instance = cls._instances.get(cls)
        if instance is None:
            with cls._lock:
                # Re-check under the lock: another thread may have won the race.
                instance = cls._instances.get(cls)
                if instance is None:
                    instance = super().__new__(cls)
                    cls._instances[cls] = instance
        return instance
|
|
@@ -4,6 +4,7 @@ import inspect
|
|
|
4
4
|
import textwrap
|
|
5
5
|
import hashlib
|
|
6
6
|
import traceback
|
|
7
|
+
import threading
|
|
7
8
|
import cloudpickle
|
|
8
9
|
import logging
|
|
9
10
|
from ..core.resources import LiveServerless
|
|
@@ -16,8 +17,9 @@ from ..protos.remote_execution import (
|
|
|
16
17
|
log = logging.getLogger(__name__)
|
|
17
18
|
|
|
18
19
|
|
|
19
|
-
#
|
|
20
|
+
# Global in-memory cache with thread safety
|
|
20
21
|
_SERIALIZED_FUNCTION_CACHE = {}
|
|
22
|
+
_function_cache_lock = threading.RLock()
|
|
21
23
|
|
|
22
24
|
|
|
23
25
|
def get_function_source(func):
|
|
@@ -80,12 +82,14 @@ class LiveServerlessStub(RemoteExecutorStub):
|
|
|
80
82
|
"hf_models_to_cache": hf_models_to_cache,
|
|
81
83
|
}
|
|
82
84
|
|
|
83
|
-
#
|
|
84
|
-
|
|
85
|
-
#
|
|
86
|
-
|
|
85
|
+
# Thread-safe cache access
|
|
86
|
+
with _function_cache_lock:
|
|
87
|
+
# check if the function is already cached
|
|
88
|
+
if src_hash not in _SERIALIZED_FUNCTION_CACHE:
|
|
89
|
+
# Cache the serialized function
|
|
90
|
+
_SERIALIZED_FUNCTION_CACHE[src_hash] = source
|
|
87
91
|
|
|
88
|
-
|
|
92
|
+
request["function_code"] = _SERIALIZED_FUNCTION_CACHE[src_hash]
|
|
89
93
|
|
|
90
94
|
# Serialize arguments using cloudpickle
|
|
91
95
|
if args:
|
tetra_rp/stubs/registry.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
from functools import singledispatch
|
|
3
3
|
|
|
4
4
|
from ..core.resources import (
|
|
5
|
+
CpuLiveServerless,
|
|
5
6
|
CpuServerlessEndpoint,
|
|
6
7
|
LiveServerless,
|
|
7
8
|
ServerlessEndpoint,
|
|
@@ -20,8 +21,8 @@ def stub_resource(resource, **extra):
|
|
|
20
21
|
return fallback
|
|
21
22
|
|
|
22
23
|
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
def _create_live_serverless_stub(resource, **extra):
|
|
25
|
+
"""Create a live serverless stub for both LiveServerless and CpuLiveServerless."""
|
|
25
26
|
stub = LiveServerlessStub(resource)
|
|
26
27
|
|
|
27
28
|
# Function execution
|
|
@@ -60,15 +61,27 @@ def _(resource, **extra):
|
|
|
60
61
|
return stubbed_resource
|
|
61
62
|
|
|
62
63
|
|
|
64
|
+
@stub_resource.register(LiveServerless)
|
|
65
|
+
def _(resource, **extra):
|
|
66
|
+
return _create_live_serverless_stub(resource, **extra)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@stub_resource.register(CpuLiveServerless)
|
|
70
|
+
def _(resource, **extra):
|
|
71
|
+
return _create_live_serverless_stub(resource, **extra)
|
|
72
|
+
|
|
73
|
+
|
|
63
74
|
@stub_resource.register(ServerlessEndpoint)
|
|
64
75
|
def _(resource, **extra):
|
|
65
76
|
async def stubbed_resource(
|
|
66
|
-
func,
|
|
77
|
+
func,
|
|
78
|
+
dependencies,
|
|
79
|
+
system_dependencies,
|
|
80
|
+
accelerate_downloads,
|
|
81
|
+
hf_models_to_cache,
|
|
82
|
+
*args,
|
|
83
|
+
**kwargs,
|
|
67
84
|
) -> dict:
|
|
68
|
-
if args == (None,):
|
|
69
|
-
# cleanup: when the function is called with no args
|
|
70
|
-
args = []
|
|
71
|
-
|
|
72
85
|
if dependencies or system_dependencies:
|
|
73
86
|
log.warning(
|
|
74
87
|
"Dependencies are not supported for ServerlessEndpoint. "
|
|
@@ -86,12 +99,14 @@ def _(resource, **extra):
|
|
|
86
99
|
@stub_resource.register(CpuServerlessEndpoint)
|
|
87
100
|
def _(resource, **extra):
|
|
88
101
|
async def stubbed_resource(
|
|
89
|
-
func,
|
|
102
|
+
func,
|
|
103
|
+
dependencies,
|
|
104
|
+
system_dependencies,
|
|
105
|
+
accelerate_downloads,
|
|
106
|
+
hf_models_to_cache,
|
|
107
|
+
*args,
|
|
108
|
+
**kwargs,
|
|
90
109
|
) -> dict:
|
|
91
|
-
if args == (None,):
|
|
92
|
-
# cleanup: when the function is called with no args
|
|
93
|
-
args = []
|
|
94
|
-
|
|
95
110
|
if dependencies or system_dependencies:
|
|
96
111
|
log.warning(
|
|
97
112
|
"Dependencies are not supported for CpuServerlessEndpoint. "
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tetra_rp
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.12.0
|
|
4
4
|
Summary: A Python library for distributed inference and serving of machine learning models
|
|
5
5
|
Author-email: Marut Pandya <pandyamarut@gmail.com>, Patrick Rachford <prachford@icloud.com>, Dean Quinanola <dean.quinanola@runpod.io>
|
|
6
6
|
License: MIT
|
|
@@ -1,36 +1,38 @@
|
|
|
1
|
-
tetra_rp/__init__.py,sha256=
|
|
1
|
+
tetra_rp/__init__.py,sha256=_D3Wbtv9tBh_WGS4Si5uQbsefL63GqqjoGTN3R8P6fA,769
|
|
2
2
|
tetra_rp/client.py,sha256=urSVh0j9didd9U8lboPv3TtFYURp2XO6ReOICr9Xrls,3414
|
|
3
3
|
tetra_rp/execute_class.py,sha256=jYNFalqqjKvvCz1zzodRvOkrLQd2FYnLYa4EElEYp8w,12243
|
|
4
4
|
tetra_rp/logger.py,sha256=gk5-PWp3k_GQ5DxndsRkBCX0jarp_3lgZ1oiTFuThQg,1125
|
|
5
5
|
tetra_rp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
tetra_rp/core/api/__init__.py,sha256=oldrEKMwxYoBPLvPfVlaFS3wfUtTTxCN6-HzlpTh6vE,124
|
|
7
7
|
tetra_rp/core/api/runpod.py,sha256=3TTx1fkXMLZ2R5JCrQYPEn8dhdUsBt8i5OEwAfaKQ_k,10451
|
|
8
|
-
tetra_rp/core/resources/__init__.py,sha256=
|
|
8
|
+
tetra_rp/core/resources/__init__.py,sha256=ZvSfceYV4S1Xo3YA03B-sR5VRud8BKfx5Pe44xFfruo,911
|
|
9
9
|
tetra_rp/core/resources/base.py,sha256=UJeDiFN45aO1n5SBcxn56ohLhj-AWHoj0KO7mF4yJ_o,1440
|
|
10
10
|
tetra_rp/core/resources/cloud.py,sha256=XJOWPfzYlDVJGHxgffcfpEaOKrWhGdi7AzTlaGuYj0o,70
|
|
11
11
|
tetra_rp/core/resources/constants.py,sha256=F1gPqFaXcCmfrbUSO9PQtUBv984TxFc3pySgVy-kXk8,158
|
|
12
|
-
tetra_rp/core/resources/cpu.py,sha256=
|
|
12
|
+
tetra_rp/core/resources/cpu.py,sha256=szhkjaJ9OnjzLvaD7Yc2hRCR-C9eyB6tswo1Qe6VuZ0,3962
|
|
13
13
|
tetra_rp/core/resources/environment.py,sha256=FC9kJCa8YLSar75AKUKqJYnNLrUdjZj8ZTOrspBrS00,1267
|
|
14
|
-
tetra_rp/core/resources/gpu.py,sha256=
|
|
15
|
-
tetra_rp/core/resources/live_serverless.py,sha256=
|
|
14
|
+
tetra_rp/core/resources/gpu.py,sha256=mMOPLhBugFBAMAl3ezhjAxKuvYya5_9A_h7kvaCoAfk,1885
|
|
15
|
+
tetra_rp/core/resources/live_serverless.py,sha256=FLmaQdn5UMczEfkP3qykIfRVfZeyYdvyNHX9Nd13_54,1868
|
|
16
16
|
tetra_rp/core/resources/network_volume.py,sha256=h_1xhrbBm9jJWROOGl5qy9u4_kCKSyV4idzt0567-J8,5193
|
|
17
|
-
tetra_rp/core/resources/resource_manager.py,sha256=
|
|
18
|
-
tetra_rp/core/resources/serverless.py,sha256=
|
|
17
|
+
tetra_rp/core/resources/resource_manager.py,sha256=K-SgCk2BMNEAnkB87YynxUH-suZcdcOPLMonL7EogIw,4988
|
|
18
|
+
tetra_rp/core/resources/serverless.py,sha256=1T21RkMjGnM1I87AsGQ6qazp7A9cE7LwH_c6yJ5shPQ,13427
|
|
19
|
+
tetra_rp/core/resources/serverless_cpu.py,sha256=OiG1C_5_j7pYPHdGp3lQQanIl2ak81u8-jlZo2OXflA,5567
|
|
19
20
|
tetra_rp/core/resources/template.py,sha256=qQ8Wd7Rzr1_YeAbW1V7_k7AVHzgWR_RPjcaRfKsetAk,3141
|
|
20
21
|
tetra_rp/core/resources/utils.py,sha256=mgXfgz_NuHN_IC7TzMNdH9II-LMjxcDCG7syDTcPiGs,1721
|
|
21
22
|
tetra_rp/core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
23
|
tetra_rp/core/utils/backoff.py,sha256=1pfa0smFNpib8nztcIgBbtrVvQeECKh-aNOfL2TztgU,1324
|
|
23
24
|
tetra_rp/core/utils/constants.py,sha256=Dm4XiO5zTzfdqOSeYVfAjaf2LyHnIEVmbOi_s_k1J_E,375
|
|
25
|
+
tetra_rp/core/utils/file_lock.py,sha256=bxtAexD2rbqMhdr94VbmKdNp0gfKRgxDXx1n7LX4Eso,8269
|
|
24
26
|
tetra_rp/core/utils/json.py,sha256=q0r7aEdfh8kKVeHGeh9fBDfuhHYNopSreislAMB6HhM,1163
|
|
25
27
|
tetra_rp/core/utils/lru_cache.py,sha256=drwKg-DfLbeBRGTzuxKqNKMQq0EuZV15LMTZIOyZuVk,2618
|
|
26
|
-
tetra_rp/core/utils/singleton.py,sha256=
|
|
28
|
+
tetra_rp/core/utils/singleton.py,sha256=lSXgEQGX9nzhrc05GMpThn9SHKG45iajBbSEtwCcNyI,632
|
|
27
29
|
tetra_rp/protos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
30
|
tetra_rp/protos/remote_execution.py,sha256=flKJG0U4ked84cXyF4Gfs_7fBgLsEVOzBv8ZWB9UlP0,5648
|
|
29
31
|
tetra_rp/stubs/__init__.py,sha256=ozKsHs8q0T7o2qhQEquub9hqomh1Htys53mMraaRu2E,72
|
|
30
|
-
tetra_rp/stubs/live_serverless.py,sha256=
|
|
31
|
-
tetra_rp/stubs/registry.py,sha256=
|
|
32
|
+
tetra_rp/stubs/live_serverless.py,sha256=2bQ47Hq4D8o3YydkxR6QVuLjYrzYS427r3067beYK2A,4550
|
|
33
|
+
tetra_rp/stubs/registry.py,sha256=akJREObyvTAVp6Rvxw8O49W52eJAjByleLU_phMyaDI,3351
|
|
32
34
|
tetra_rp/stubs/serverless.py,sha256=BM_a5Ml5VADBYu2WRNmo9qnicP8NnXDGl5ywifulbD0,947
|
|
33
|
-
tetra_rp-0.
|
|
34
|
-
tetra_rp-0.
|
|
35
|
-
tetra_rp-0.
|
|
36
|
-
tetra_rp-0.
|
|
35
|
+
tetra_rp-0.12.0.dist-info/METADATA,sha256=u2OHbJV-8KIiOca3SenK2J6I8ms13P8Jc64KGNhFAyE,28077
|
|
36
|
+
tetra_rp-0.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
37
|
+
tetra_rp-0.12.0.dist-info/top_level.txt,sha256=bBay7JTDwJXsTYvVjrwno9hnF-j0q272lk65f2AcPjU,9
|
|
38
|
+
tetra_rp-0.12.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|