tetra-rp 0.5.5__py3-none-any.whl
- tetra_rp/__init__.py +37 -0
- tetra_rp/client.py +59 -0
- tetra_rp/core/__init__.py +0 -0
- tetra_rp/core/api/__init__.py +5 -0
- tetra_rp/core/api/runpod.py +212 -0
- tetra_rp/core/pool/__init__.py +0 -0
- tetra_rp/core/pool/cluster_manager.py +177 -0
- tetra_rp/core/pool/dataclass.py +18 -0
- tetra_rp/core/pool/ex.py +38 -0
- tetra_rp/core/pool/job.py +22 -0
- tetra_rp/core/pool/worker.py +19 -0
- tetra_rp/core/resources/__init__.py +33 -0
- tetra_rp/core/resources/base.py +47 -0
- tetra_rp/core/resources/cloud.py +4 -0
- tetra_rp/core/resources/cpu.py +34 -0
- tetra_rp/core/resources/environment.py +41 -0
- tetra_rp/core/resources/gpu.py +53 -0
- tetra_rp/core/resources/live_serverless.py +32 -0
- tetra_rp/core/resources/resource_manager.py +80 -0
- tetra_rp/core/resources/serverless.py +476 -0
- tetra_rp/core/resources/template.py +94 -0
- tetra_rp/core/resources/utils.py +50 -0
- tetra_rp/core/utils/__init__.py +0 -0
- tetra_rp/core/utils/backoff.py +43 -0
- tetra_rp/core/utils/json.py +33 -0
- tetra_rp/core/utils/singleton.py +7 -0
- tetra_rp/logger.py +34 -0
- tetra_rp/protos/__init__.py +0 -0
- tetra_rp/protos/remote_execution.py +57 -0
- tetra_rp/stubs/__init__.py +5 -0
- tetra_rp/stubs/live_serverless.py +133 -0
- tetra_rp/stubs/registry.py +85 -0
- tetra_rp/stubs/serverless.py +30 -0
- tetra_rp-0.5.5.dist-info/METADATA +806 -0
- tetra_rp-0.5.5.dist-info/RECORD +37 -0
- tetra_rp-0.5.5.dist-info/WHEEL +5 -0
- tetra_rp-0.5.5.dist-info/top_level.txt +1 -0

tetra_rp/core/resources/cpu.py
@@ -0,0 +1,34 @@
from enum import Enum


class CpuInstanceType(str, Enum):
    """Valid CPU instance types.

    Format: {generation}{type}-{vcpu}-{memory_gb}
    Based on Runpod backend validation logic:
    - memoryInGb = vcpuCount * flavor.ramMultiplier

    RAM Multipliers (DEV environment):
    - cpu3g: 4.0 (1 vCPU = 4GB, 2 vCPU = 8GB, etc.)
    - cpu3c: 2.0 (1 vCPU = 2GB, 2 vCPU = 4GB, etc.)
    - cpu5c: 2.0 (1 vCPU = 2GB, 2 vCPU = 4GB, etc.)
    - cpu5g: Not available
    """

    # 3rd Generation General Purpose (RAM multiplier: 4.0)
    CPU3G_1_4 = "cpu3g-1-4"  # 1 vCPU, 4GB RAM
    CPU3G_2_8 = "cpu3g-2-8"  # 2 vCPU, 8GB RAM
    CPU3G_4_16 = "cpu3g-4-16"  # 4 vCPU, 16GB RAM
    CPU3G_8_32 = "cpu3g-8-32"  # 8 vCPU, 32GB RAM

    # 3rd Generation Compute-Optimized (RAM multiplier: 2.0)
    CPU3C_1_2 = "cpu3c-1-2"  # 1 vCPU, 2GB RAM
    CPU3C_2_4 = "cpu3c-2-4"  # 2 vCPU, 4GB RAM
    CPU3C_4_8 = "cpu3c-4-8"  # 4 vCPU, 8GB RAM
    CPU3C_8_16 = "cpu3c-8-16"  # 8 vCPU, 16GB RAM

    # 5th Generation Compute-Optimized (RAM multiplier: 2.0)
    CPU5C_1_2 = "cpu5c-1-2"  # 1 vCPU, 2GB RAM
    CPU5C_2_4 = "cpu5c-2-4"  # 2 vCPU, 4GB RAM
    CPU5C_4_8 = "cpu5c-4-8"  # 4 vCPU, 8GB RAM
    CPU5C_8_16 = "cpu5c-8-16"  # 8 vCPU, 16GB RAM

tetra_rp/core/resources/environment.py
@@ -0,0 +1,41 @@
from typing import Dict, Optional
from dotenv import dotenv_values


class EnvironmentVars:
    def __init__(self):
        # Store environment variables from .env file
        self.env = self._load_env()

    def _load_env(self) -> Dict[str, str]:
        """
        Loads environment variables specifically from the .env file
        and returns them as a dictionary.

        Returns:
            Dict[str, str]: Dictionary containing environment variables from .env file
        """
        # Use dotenv_values instead of load_dotenv to get only variables from .env
        return dict(dotenv_values())

    def get_env(self) -> Dict[str, str]:
        """
        Returns the dictionary of environment variables.

        Returns:
            Dict[str, str]: Dictionary containing environment variables
        """
        return self.env

    def get_value(self, key: str, default: Optional[str] = None) -> Optional[str]:
        """
        Gets a specific environment variable by key.

        Args:
            key (str): The environment variable key
            default (str, optional): Default value if key doesn't exist

        Returns:
            Optional[str]: Value of the environment variable or default
        """
        return self.env.get(key, default)

tetra_rp/core/resources/gpu.py
@@ -0,0 +1,53 @@
from typing import Optional, List
from pydantic import BaseModel
from enum import Enum


class GpuLowestPrice(BaseModel):
    minimumBidPrice: Optional[float] = None
    uninterruptablePrice: Optional[float] = None


class GpuType(BaseModel):
    id: str
    displayName: str
    memoryInGb: int


class GpuTypeDetail(GpuType):
    communityCloud: Optional[bool] = None
    communityPrice: Optional[float] = None
    communitySpotPrice: Optional[float] = None
    cudaCores: Optional[int] = None
    lowestPrice: Optional[GpuLowestPrice] = None
    manufacturer: Optional[str] = None
    maxGpuCount: Optional[int] = None
    oneMonthPrice: Optional[float] = None
    oneWeekPrice: Optional[float] = None
    secureCloud: Optional[bool] = None
    securePrice: Optional[float] = None
    secureSpotPrice: Optional[float] = None
    threeMonthPrice: Optional[float] = None


# TODO: this should be fetched from an API
class GpuGroup(Enum):
    ANY = "any"  # "Any GPU"
    ADA_24 = "ADA_24"  # "NVIDIA GeForce RTX 4090"
    ADA_32_PRO = "ADA_32_PRO"  # "NVIDIA GeForce RTX 5090"
    ADA_48_PRO = (
        "ADA_48_PRO"  # "NVIDIA RTX 6000 Ada Generation, NVIDIA L40, NVIDIA L40S"
    )
    ADA_80_PRO = (
        "ADA_80_PRO"  # "NVIDIA H100 PCIe, NVIDIA H100 80GB HBM3, NVIDIA H100 NVL"
    )
    AMPERE_16 = "AMPERE_16"  # "NVIDIA RTX A4000, NVIDIA RTX A4500, NVIDIA RTX 4000 Ada Generation, NVIDIA RTX 2000 Ada Generation"
    AMPERE_24 = "AMPERE_24"  # "NVIDIA RTX A5000, NVIDIA L4, NVIDIA GeForce RTX 3090"
    AMPERE_48 = "AMPERE_48"  # "NVIDIA A40, NVIDIA RTX A6000"
    AMPERE_80 = "AMPERE_80"  # "NVIDIA A100 80GB PCIe, NVIDIA A100-SXM4-80GB"
    HOPPER_141 = "HOPPER_141"  # "NVIDIA H200"

    @classmethod
    def all(cls) -> List["GpuGroup"]:
        """Returns all GPU groups, AMPERE_48 first, excluding ANY."""
        return [cls.AMPERE_48] + [g for g in cls if g not in (cls.ANY, cls.AMPERE_48)]

tetra_rp/core/resources/live_serverless.py
@@ -0,0 +1,32 @@
# Ship serverless code as you write it. No builds, no deploys — just run.
import os
from pydantic import model_validator
from .serverless import ServerlessEndpoint


TETRA_GPU_IMAGE = os.environ.get("TETRA_GPU_IMAGE", "runpod/tetra-rp:dev")
TETRA_CPU_IMAGE = os.environ.get("TETRA_CPU_IMAGE", "runpod/tetra-rp-cpu:dev")


class LiveServerless(ServerlessEndpoint):
    @model_validator(mode="before")
    @classmethod
    def set_live_serverless_template(cls, data: dict):
        """Set the default image for Live Serverless; it cannot be overridden."""
        # Always set imageName based on instanceIds presence
        data["imageName"] = (
            TETRA_CPU_IMAGE if data.get("instanceIds") else TETRA_GPU_IMAGE
        )
        return data

    @property
    def imageName(self):
        # Lock imageName to always reflect instanceIds
        return (
            TETRA_CPU_IMAGE if getattr(self, "instanceIds", None) else TETRA_GPU_IMAGE
        )

    @imageName.setter
    def imageName(self, value):
        # Prevent manual setting of imageName
        pass

tetra_rp/core/resources/resource_manager.py
@@ -0,0 +1,80 @@
import cloudpickle
import logging
from typing import Dict
from pathlib import Path

from ..utils.singleton import SingletonMixin

from .base import DeployableResource


log = logging.getLogger(__name__)

# File to persist state of resources
RESOURCE_STATE_FILE = Path(".tetra_resources.pkl")


class ResourceManager(SingletonMixin):
    """Manages dynamic provisioning and tracking of remote resources."""

    _resources: Dict[str, DeployableResource] = {}

    def __init__(self):
        if not self._resources:
            self._load_resources()

    def _load_resources(self) -> Dict[str, DeployableResource]:
        """Load persisted resource information using cloudpickle."""
        if RESOURCE_STATE_FILE.exists():
            try:
                with open(RESOURCE_STATE_FILE, "rb") as f:
                    self._resources = cloudpickle.load(f)
                log.debug(f"Loaded saved resources from {RESOURCE_STATE_FILE}")
            except Exception as e:
                log.error(f"Failed to load resources from {RESOURCE_STATE_FILE}: {e}")
        return self._resources

    def _save_resources(self) -> None:
        """Persist state of resources to disk using cloudpickle."""
        with open(RESOURCE_STATE_FILE, "wb") as f:
            cloudpickle.dump(self._resources, f)
        log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")

    def add_resource(self, uid: str, resource: DeployableResource):
        """Add a resource to the manager."""
        self._resources[uid] = resource
        self._save_resources()

    # TODO: check whether the resource still exists remotely; remove it if not
    def remove_resource(self, uid: str):
        """Remove a resource from the manager."""
        if uid not in self._resources:
            log.warning(f"Resource {uid} not found for removal")
            return

        del self._resources[uid]
        log.debug(f"Removed resource {uid}")

        self._save_resources()

    async def get_or_deploy_resource(
        self, config: DeployableResource
    ) -> DeployableResource:
        """Get existing or create new resource based on config."""
        uid = config.resource_id
        if existing := self._resources.get(uid):
            if not existing.is_deployed():
                log.warning(f"{existing} is no longer valid, redeploying.")
                self.remove_resource(uid)
                return await self.get_or_deploy_resource(config)

            log.debug(f"{existing} exists, reusing.")
            log.info(f"URL: {existing.url}")
            return existing

        if deployed_resource := await config.deploy():
            log.info(f"URL: {deployed_resource.url}")
            self.add_resource(uid, deployed_resource)
            return deployed_resource

        raise RuntimeError(f"Deployment failed for resource {uid}")
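
Callers interact only with get_or_deploy_resource; caching, on-disk persistence, and redeployment of stale endpoints are handled internally. A minimal usage sketch, assuming the LiveServerless class above and a name field inherited from ServerlessEndpoint:

    import asyncio

    async def main():
        manager = ResourceManager()  # singleton; reloads .tetra_resources.pkl if present
        endpoint = await manager.get_or_deploy_resource(LiveServerless(name="demo"))
        print(endpoint.url)  # subsequent runs reuse the persisted endpoint

    asyncio.run(main())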