tetra-rp 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tetra_rp/core/api/runpod.py +24 -0
- tetra_rp/core/resources/live_serverless.py +7 -3
- tetra_rp/core/resources/network_volume.py +68 -23
- tetra_rp/core/resources/serverless.py +7 -66
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/execute_class.py +162 -24
- {tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/METADATA +3 -3
- {tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/RECORD +11 -15
- tetra_rp/core/pool/__init__.py +0 -0
- tetra_rp/core/pool/cluster_manager.py +0 -177
- tetra_rp/core/pool/dataclass.py +0 -18
- tetra_rp/core/pool/ex.py +0 -38
- tetra_rp/core/pool/job.py +0 -22
- tetra_rp/core/pool/worker.py +0 -19
- {tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/WHEEL +0 -0
- {tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/top_level.txt +0 -0
tetra_rp/core/api/runpod.py
CHANGED
@@ -281,6 +281,30 @@ class RunpodRestClient:
 
         return result
 
+    async def list_network_volumes(self) -> Dict[str, Any]:
+        """
+        List all network volumes in Runpod.
+
+        Returns:
+            List of network volume objects or dict containing networkVolumes key.
+            The API may return either format depending on version.
+        """
+        log.debug("Listing network volumes")
+
+        result = await self._execute_rest(
+            "GET", f"{RUNPOD_REST_API_URL}/networkvolumes"
+        )
+
+        # Handle both list and dict responses
+        if isinstance(result, list):
+            volume_count = len(result)
+        else:
+            volume_count = len(result.get("networkVolumes", []))
+
+        log.debug(f"Listed {volume_count} network volumes")
+
+        return result
+
     async def close(self):
         """Close the HTTP session."""
         if self.session and not self.session.closed:
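The new `list_network_volumes` method deliberately tolerates both response shapes the REST API may return. A minimal caller-side sketch (assuming credentials are configured via the environment, and that `RunpodRestClient` is usable as an async context manager, as the `network_volume.py` changes below suggest):

import asyncio

from tetra_rp.core.api.runpod import RunpodRestClient


async def print_volumes():
    async with RunpodRestClient() as client:
        response = await client.list_network_volumes()
        # Mirror the method's own normalization: the API returns either a
        # bare list or a dict keyed by "networkVolumes".
        volumes = (
            response
            if isinstance(response, list)
            else response.get("networkVolumes", [])
        )
        for volume in volumes:
            print(volume.get("id"), volume.get("name"))


asyncio.run(print_volumes())
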
tetra_rp/core/resources/live_serverless.py
CHANGED

@@ -3,9 +3,13 @@ import os
 from pydantic import model_validator
 from .serverless import ServerlessEndpoint
 
-…
-TETRA_GPU_IMAGE = os.environ.get(…
-…
+TETRA_IMAGE_TAG = os.environ.get("TETRA_IMAGE_TAG", "latest")
+TETRA_GPU_IMAGE = os.environ.get(
+    "TETRA_GPU_IMAGE", f"runpod/tetra-rp:{TETRA_IMAGE_TAG}"
+)
+TETRA_CPU_IMAGE = os.environ.get(
+    "TETRA_CPU_IMAGE", f"runpod/tetra-rp-cpu:{TETRA_IMAGE_TAG}"
+)
 
 
 class LiveServerless(ServerlessEndpoint):
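Since both image constants now derive from `TETRA_IMAGE_TAG`, a single environment variable pins the GPU and CPU worker images to the same release. A sketch of the resolution (the tag value is illustrative; the module reads these variables at import time, so they must be set beforehand):

import os

os.environ["TETRA_IMAGE_TAG"] = "v1.2.3"  # hypothetical tag

from tetra_rp.core.resources.live_serverless import TETRA_CPU_IMAGE, TETRA_GPU_IMAGE

print(TETRA_GPU_IMAGE)  # runpod/tetra-rp:v1.2.3
print(TETRA_CPU_IMAGE)  # runpod/tetra-rp-cpu:v1.2.3

Setting `TETRA_GPU_IMAGE` or `TETRA_CPU_IMAGE` directly still overrides the derived defaults.
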
tetra_rp/core/resources/network_volume.py
CHANGED

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 from enum import Enum
 from typing import Optional

@@ -25,10 +26,11 @@ class DataCenter(str, Enum):
 
 class NetworkVolume(DeployableResource):
     """
-    NetworkVolume resource for creating and managing Runpod
+    NetworkVolume resource for creating and managing Runpod network volumes.
 
     This class handles the creation, deployment, and management of network volumes
-    that can be attached to serverless resources.
+    that can be attached to serverless resources. Supports idempotent deployment
+    where multiple volumes with the same name will reuse existing volumes.
 
     """

@@ -37,11 +39,24 @@ class NetworkVolume(DeployableResource):
 
     id: Optional[str] = Field(default=None)
     name: Optional[str] = None
-    size: Optional[int] = Field(default=…
+    size: Optional[int] = Field(default=50, gt=0)  # Size in GB
 
     def __str__(self) -> str:
         return f"{self.__class__.__name__}:{self.id}"
 
+    @property
+    def resource_id(self) -> str:
+        """Unique resource ID based on name and datacenter for idempotent behavior."""
+        if self.name:
+            # Use name + datacenter for volumes with names to ensure idempotence
+            resource_type = self.__class__.__name__
+            config_key = f"{self.name}:{self.dataCenterId.value}"
+            hash_obj = hashlib.md5(f"{resource_type}:{config_key}".encode())
+            return f"{resource_type}_{hash_obj.hexdigest()}"
+        else:
+            # Fall back to default behavior for unnamed volumes
+            return super().resource_id
+
     @field_serializer("dataCenterId")
     def serialize_data_center_id(self, value: Optional[DataCenter]) -> Optional[str]:
         """Convert DataCenter enum to string."""

@@ -61,24 +76,57 @@ class NetworkVolume(DeployableResource):
             raise ValueError("Network volume ID is not set")
         return f"{CONSOLE_BASE_URL}/user/storage"
 
-…
+    def is_deployed(self) -> bool:
         """
-…
-        Returns the volume ID.
+        Checks if the network volume resource is deployed and available.
         """
-…
-…
-…
-…
+        return self.id is not None
+
+    def _normalize_volumes_response(self, volumes_response) -> list:
+        """Normalize API response to list format."""
+        if isinstance(volumes_response, list):
+            return volumes_response
+        return volumes_response.get("networkVolumes", [])
+
+    def _find_matching_volume(self, existing_volumes: list) -> Optional[dict]:
+        """Find existing volume matching name and datacenter."""
+        for volume_data in existing_volumes:
+            if (
+                volume_data.get("name") == self.name
+                and volume_data.get("dataCenterId") == self.dataCenterId.value
+            ):
+                return volume_data
+        return None
+
+    async def _find_existing_volume(self, client) -> Optional["NetworkVolume"]:
+        """Check for existing volume with same name and datacenter."""
+        if not self.name:
+            return None
+
+        log.debug(f"Checking for existing network volume with name: {self.name}")
+        volumes_response = await client.list_network_volumes()
+        existing_volumes = self._normalize_volumes_response(volumes_response)
+
+        if matching_volume := self._find_matching_volume(existing_volumes):
+            log.info(
+                f"Found existing network volume: {matching_volume.get('id')} with name '{self.name}'"
+            )
+            # Update our instance with the existing volume's ID
+            self.id = matching_volume.get("id")
+            return self
+
+        return None
+
+    async def _create_new_volume(self, client) -> "NetworkVolume":
+        """Create a new network volume."""
+        log.debug(f"Creating new network volume: {self.name or 'unnamed'}")
+        payload = self.model_dump(exclude_none=True)
+        result = await client.create_network_volume(payload)
 
         if volume := self.__class__(**result):
             return volume
 
-…
-        """
-        Checks if the network volume resource is deployed and available.
-        """
-        return self.id is not None
+        raise ValueError("Deployment failed, no volume was created.")
 
     async def deploy(self) -> "DeployableResource":
         """

@@ -91,16 +139,13 @@ class NetworkVolume(DeployableResource):
                 log.debug(f"{self} exists")
                 return self
 
-            # Create the network volume
             async with RunpodRestClient() as client:
-                #…
-…
-…
-…
-                if volume := self.__class__(**result):
-                    return volume
+                # Check for existing volume first
+                if existing_volume := await self._find_existing_volume(client):
+                    return existing_volume
 
-…
+                # No existing volume found, create a new one
+                return await self._create_new_volume(client)
 
         except Exception as e:
             log.error(f"{self} failed to deploy: {e}")
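Taken together, the hashed `resource_id` and the lookup-before-create logic in `deploy()` make named volumes idempotent. A hedged usage sketch (assuming valid Runpod credentials and the default datacenter; the volume name is illustrative):

import asyncio

from tetra_rp.core.resources.network_volume import NetworkVolume


async def main():
    # Both deploys target the same name and datacenter, so the second call
    # should find the volume created by the first via list_network_volumes()
    # and reuse its ID instead of creating a duplicate.
    first = await NetworkVolume(name="training-data", size=100).deploy()
    second = await NetworkVolume(name="training-data", size=100).deploy()
    assert first.id == second.id


asyncio.run(main())
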
tetra_rp/core/resources/serverless.py
CHANGED

@@ -134,8 +134,12 @@ class ServerlessResource(DeployableResource):
         return value.value if value is not None else None
 
     @field_serializer("instanceIds")
-    def serialize_instance_ids(…
+    def serialize_instance_ids(
+        self, value: Optional[List[CpuInstanceType]]
+    ) -> Optional[List[str]]:
         """Convert CpuInstanceType enums to strings."""
+        if value is None:
+            return None
         return [item.value if hasattr(item, "value") else str(item) for item in value]
 
     @field_validator("gpus")

@@ -247,62 +251,6 @@ class ServerlessResource(DeployableResource):
             log.error(f"{self} failed to deploy: {e}")
             raise
 
-    async def is_ready_for_requests(self, give_up_threshold=10) -> bool:
-        """
-        Asynchronously checks if the serverless resource is ready to handle
-        requests by polling its health endpoint.
-
-        Args:
-            give_up_threshold (int, optional): The maximum number of polling
-                attempts before giving up and raising an error. Defaults to 10.
-
-        Returns:
-            bool: True if the serverless resource is ready for requests.
-
-        Raises:
-            ValueError: If the serverless resource is not deployed.
-            RuntimeError: If the health status is THROTTLED, UNHEALTHY, or UNKNOWN
-                after exceeding the give_up_threshold.
-        """
-        if not self.is_deployed():
-            raise ValueError("Serverless is not deployed")
-
-        log.debug(f"{self} | API /health")
-
-        current_pace = 0
-        attempt = 0
-
-        # Poll for health status
-        while True:
-            await asyncio.sleep(current_pace)
-
-            health = await asyncio.to_thread(self.endpoint.health)
-            health = ServerlessHealth(**health)
-
-            if health.is_ready:
-                return True
-            else:
-                # nothing changed, increase the gap
-                attempt += 1
-                indicator = "." * (attempt // 2) if attempt % 2 == 0 else ""
-                if indicator:
-                    log.info(f"{self} | {indicator}")
-
-                status = health.workers.status
-                if status in [
-                    Status.THROTTLED,
-                    Status.UNHEALTHY,
-                    Status.UNKNOWN,
-                ]:
-                    log.debug(f"{self} | Health {status.value}")
-
-                    if attempt >= give_up_threshold:
-                        # Give up
-                        raise RuntimeError(f"Health {status.value}")
-
-                    # Adjust polling pace appropriately
-                    current_pace = get_backoff_delay(attempt)
-
     async def run_sync(self, payload: Dict[str, Any]) -> "JobOutput":
         """
         Executes a serverless endpoint request with the payload.

@@ -319,9 +267,6 @@ class ServerlessResource(DeployableResource):
         try:
             # log.debug(f"[{log_group}] Payload: {payload}")
 
-            # Poll until requests can be sent
-            await self.is_ready_for_requests()
-
             log.info(f"{self} | API /run_sync")
             response = await asyncio.to_thread(_fetch_job)
             return JobOutput(**response)

@@ -346,9 +291,6 @@ class ServerlessResource(DeployableResource):
         try:
             # log.debug(f"[{self}] Payload: {payload}")
 
-            # Poll until requests can be sent
-            await self.is_ready_for_requests()
-
             # Create a job using the endpoint
             log.info(f"{self} | API /run")
             job = await asyncio.to_thread(self.endpoint.run, request_input=payload)

@@ -366,9 +308,8 @@ class ServerlessResource(DeployableResource):
         while True:
             await asyncio.sleep(current_pace)
 
-…
-…
-            job_status = await asyncio.to_thread(job.status)
+            # Check job status
+            job_status = await asyncio.to_thread(job.status)
 
             if last_status == job_status:
                 # nothing changed, increase the gap
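With `is_ready_for_requests` gone, `run_sync` and `run` no longer poll `/health` before sending work; only the job-status loop keeps its adaptive pacing via `get_backoff_delay(attempt)`. That helper's body is not shown in this diff; a hypothetical stand-in with the same call shape, assuming a capped exponential policy:

def get_backoff_delay(attempt: int, base: float = 1.0, cap: float = 30.0) -> float:
    """Hypothetical stand-in for tetra_rp.core.utils.backoff.get_backoff_delay:
    grow the delay exponentially with each attempt, capped at `cap` seconds."""
    return min(base * (2 ** attempt), cap)


# Pacing the retained status loop would then produce: 2.0, 4.0, 8.0, 16.0, 30.0, 30.0
print([get_backoff_delay(n) for n in range(1, 7)])
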
tetra_rp/core/utils/constants.py
ADDED

@@ -0,0 +1,10 @@
+"""
+Constants for utility modules and caching configurations.
+
+This module contains configurable constants used across the tetra-rp codebase
+to ensure consistency and easy maintenance.
+"""
+
+# Cache key generation constants
+HASH_TRUNCATE_LENGTH = 16  # Length to truncate hash values for cache keys
+UUID_FALLBACK_LENGTH = 8  # Length to truncate UUID values for fallback keys
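These lengths are consumed by the cache-key builder added to `execute_class.py` below: a 64-character SHA-256 hex digest is truncated before being embedded in a key. For example:

import hashlib

HASH_TRUNCATE_LENGTH = 16  # as defined above

digest = hashlib.sha256(b"class source").hexdigest()
print(len(digest))                    # 64
print(digest[:HASH_TRUNCATE_LENGTH])  # the 16-character prefix used in keys
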
tetra_rp/core/utils/lru_cache.py
ADDED

@@ -0,0 +1,75 @@
+"""
+LRU Cache implementation using OrderedDict for memory-efficient caching with automatic eviction.
+
+This module provides a Least Recently Used (LRU) cache implementation that automatically
+manages memory by evicting the least recently used items when the cache exceeds its
+maximum size limit. It maintains O(1) access time and provides a dict-like interface.
+Thread-safe for concurrent access.
+"""
+
+import threading
+from collections import OrderedDict
+from typing import Any, Dict, Optional
+
+
+class LRUCache:
+    """
+    A Least Recently Used (LRU) cache implementation using OrderedDict.
+
+    Automatically evicts the least recently used items when the cache exceeds
+    the maximum size limit. Provides dict-like interface with O(1) operations.
+    Thread-safe for concurrent access using RLock.
+
+    Args:
+        max_size: Maximum number of items to store in cache (default: 1000)
+    """
+
+    def __init__(self, max_size: int = 1000):
+        self.max_size = max_size
+        self.cache = OrderedDict()
+        self._lock = threading.RLock()
+
+    def get(self, key: str) -> Optional[Dict[str, Any]]:
+        """Get item from cache, moving it to end (most recent) if found."""
+        with self._lock:
+            if key in self.cache:
+                self.cache.move_to_end(key)
+                return self.cache[key]
+            return None
+
+    def set(self, key: str, value: Dict[str, Any]) -> None:
+        """Set item in cache, evicting oldest if at capacity."""
+        with self._lock:
+            if key in self.cache:
+                self.cache.move_to_end(key)
+            else:
+                if len(self.cache) >= self.max_size:
+                    self.cache.popitem(last=False)  # Remove oldest
+            self.cache[key] = value
+
+    def clear(self) -> None:
+        """Clear all items from cache."""
+        with self._lock:
+            self.cache.clear()
+
+    def __contains__(self, key: str) -> bool:
+        """Check if key exists in cache."""
+        with self._lock:
+            return key in self.cache
+
+    def __len__(self) -> int:
+        """Return number of items in cache."""
+        with self._lock:
+            return len(self.cache)
+
+    def __getitem__(self, key: str) -> Dict[str, Any]:
+        """Get item using bracket notation, moving to end if found."""
+        with self._lock:
+            if key in self.cache:
+                self.cache.move_to_end(key)
+                return self.cache[key]
+            raise KeyError(key)
+
+    def __setitem__(self, key: str, value: Dict[str, Any]) -> None:
+        """Set item using bracket notation."""
+        self.set(key, value)
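A short sketch of the recency and eviction behavior this class guarantees:

from tetra_rp.core.utils.lru_cache import LRUCache

cache = LRUCache(max_size=2)
cache["a"] = {"v": 1}
cache["b"] = {"v": 2}

cache.get("a")         # touches "a", leaving "b" as least recently used
cache["c"] = {"v": 3}  # at capacity: evicts "b", the oldest entry

print("a" in cache, "b" in cache, "c" in cache)  # True False True
print(len(cache))                                # 2
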
tetra_rp/execute_class.py
CHANGED
@@ -1,18 +1,97 @@
+"""
+Class execution module for remote class instantiation and method calls.
+
+This module provides functionality to create and execute remote class instances,
+with automatic caching of class serialization data to improve performance and
+prevent memory leaks through LRU eviction.
+"""
+
 import base64
+import hashlib
 import inspect
 import logging
 import textwrap
 import uuid
-from typing import List,…
+from typing import List, Optional, Type
 
 import cloudpickle
 
 from .core.resources import ResourceManager, ServerlessResource
+from .core.utils.constants import HASH_TRUNCATE_LENGTH, UUID_FALLBACK_LENGTH
+from .core.utils.lru_cache import LRUCache
 from .protos.remote_execution import FunctionRequest
 from .stubs import stub_resource
 
 log = logging.getLogger(__name__)
 
+# Global in-memory cache for serialized class data with LRU eviction
+_SERIALIZED_CLASS_CACHE = LRUCache(max_size=1000)
+
+
+def serialize_constructor_args(args, kwargs):
+    """Serialize constructor arguments for caching."""
+    serialized_args = [
+        base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8") for arg in args
+    ]
+    serialized_kwargs = {
+        k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
+        for k, v in kwargs.items()
+    }
+    return serialized_args, serialized_kwargs
+
+
+def get_or_cache_class_data(
+    cls: Type, args: tuple, kwargs: dict, cache_key: str
+) -> str:
+    """Get class code from cache or extract and cache it."""
+    if cache_key not in _SERIALIZED_CLASS_CACHE:
+        # Cache miss - extract and cache class code
+        clean_class_code = extract_class_code_simple(cls)
+
+        try:
+            serialized_args, serialized_kwargs = serialize_constructor_args(
+                args, kwargs
+            )
+
+            # Cache the serialized data
+            _SERIALIZED_CLASS_CACHE.set(
+                cache_key,
+                {
+                    "class_code": clean_class_code,
+                    "constructor_args": serialized_args,
+                    "constructor_kwargs": serialized_kwargs,
+                },
+            )
+
+            log.debug(f"Cached class data for {cls.__name__} with key: {cache_key}")
+
+        except (TypeError, AttributeError, OSError) as e:
+            log.warning(
+                f"Could not serialize constructor arguments for {cls.__name__}: {e}"
+            )
+            log.warning(
+                f"Skipping constructor argument caching for {cls.__name__} due to unserializable arguments"
+            )
+
+            # Store minimal cache entry to avoid repeated attempts
+            _SERIALIZED_CLASS_CACHE.set(
+                cache_key,
+                {
+                    "class_code": clean_class_code,
+                    "constructor_args": None,  # Signal that args couldn't be cached
+                    "constructor_kwargs": None,
+                },
+            )
+
+        return clean_class_code
+    else:
+        # Cache hit - retrieve cached data
+        cached_data = _SERIALIZED_CLASS_CACHE.get(cache_key)
+        log.debug(
+            f"Retrieved cached class data for {cls.__name__} with key: {cache_key}"
+        )
+        return cached_data["class_code"]
+
 
 def extract_class_code_simple(cls: Type) -> str:
     """Extract clean class code without decorators and proper indentation"""

@@ -78,6 +157,46 @@ def extract_class_code_simple(cls: Type) -> str:
     return fallback_code
 
 
+def get_class_cache_key(
+    cls: Type, constructor_args: tuple, constructor_kwargs: dict
+) -> str:
+    """Generate a cache key for class serialization based on class source and constructor args.
+
+    Args:
+        cls: The class type to generate a key for
+        constructor_args: Positional arguments passed to class constructor
+        constructor_kwargs: Keyword arguments passed to class constructor
+
+    Returns:
+        A unique cache key string, or a UUID-based fallback if serialization fails
+
+    Note:
+        Falls back to UUID-based key if constructor arguments cannot be serialized,
+        which disables caching benefits but maintains functionality.
+    """
+    try:
+        # Get class source code for hashing
+        class_source = extract_class_code_simple(cls)
+
+        # Create hash of class source
+        class_hash = hashlib.sha256(class_source.encode()).hexdigest()
+
+        # Create hash of constructor arguments
+        args_data = cloudpickle.dumps((constructor_args, constructor_kwargs))
+        args_hash = hashlib.sha256(args_data).hexdigest()
+
+        # Combine hashes for final cache key
+        cache_key = f"{cls.__name__}_{class_hash[:HASH_TRUNCATE_LENGTH]}_{args_hash[:HASH_TRUNCATE_LENGTH]}"
+
+        log.debug(f"Generated cache key for {cls.__name__}: {cache_key}")
+        return cache_key
+
+    except (TypeError, AttributeError, OSError) as e:
+        log.warning(f"Could not generate cache key for {cls.__name__}: {e}")
+        # Fallback to basic key without caching benefits
+        return f"{cls.__name__}_{uuid.uuid4().hex[:UUID_FALLBACK_LENGTH]}"
+
+
 def create_remote_class(
     cls: Type,
     resource_config: ServerlessResource,

@@ -103,10 +222,16 @@ def create_remote_class(
             self._extra = extra
             self._constructor_args = args
             self._constructor_kwargs = kwargs
-            self._instance_id =…
+            self._instance_id = (
+                f"{cls.__name__}_{uuid.uuid4().hex[:UUID_FALLBACK_LENGTH]}"
+            )
             self._initialized = False
 
-…
+            # Generate cache key and get class code
+            self._cache_key = get_class_cache_key(cls, args, kwargs)
+            self._clean_class_code = get_or_cache_class_data(
+                cls, args, kwargs, self._cache_key
+            )
 
             log.debug(f"Created remote class wrapper for {cls.__name__}")
 

@@ -136,32 +261,45 @@ def create_remote_class(
         async def method_proxy(*args, **kwargs):
             await self._ensure_initialized()
 
-            #…
-…
-…
-…
+            # Get cached data
+            cached_data = _SERIALIZED_CLASS_CACHE.get(self._cache_key)
+
+            # Serialize method arguments (these change per call, so no caching)
+            method_args = [
+                base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
+                for arg in args
+            ]
+            method_kwargs = {
+                k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
+                for k, v in kwargs.items()
+            }
+
+            # Handle constructor args - use cached if available, else serialize fresh
+            if cached_data["constructor_args"] is not None:
+                # Use cached constructor args
+                constructor_args = cached_data["constructor_args"]
+                constructor_kwargs = cached_data["constructor_kwargs"]
+            else:
+                # Constructor args couldn't be cached due to serialization issues
+                # Serialize them fresh for each method call (fallback behavior)
+                constructor_args = [
+                    base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
+                    for arg in self._constructor_args
+                ]
+                constructor_kwargs = {
+                    k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
+                    for k, v in self._constructor_kwargs.items()
+                }
 
             request = FunctionRequest(
                 execution_type="class",
                 class_name=self._class_type.__name__,
-                class_code=class_code,
+                class_code=cached_data["class_code"],
                 method_name=name,
-                args=[
-                    base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
-                    for arg in args
-                ],
-                kwargs={
-                    k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
-                    for k, v in kwargs.items()
-                },
-                constructor_args=[
-                    base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
-                    for arg in self._constructor_args
-                ],
-                constructor_kwargs={
-                    k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
-                    for k, v in self._constructor_kwargs.items()
-                },
+                args=method_args,
+                kwargs=method_kwargs,
+                constructor_args=constructor_args,
+                constructor_kwargs=constructor_kwargs,
                 dependencies=self._dependencies,
                 system_dependencies=self._system_dependencies,
                 instance_id=self._instance_id,
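The net effect: class source and constructor arguments are serialized once per (class, constructor-args) combination, and every method call reuses the cached payload. A sketch of the key function's behavior (the `Worker` class here is illustrative, not part of tetra-rp, and should live in a real source file so its code can be extracted):

from tetra_rp.execute_class import get_class_cache_key


class Worker:  # hypothetical user class
    def __init__(self, model: str):
        self.model = model


# Same class + same constructor args -> same key, so a second remote
# instance hits the cache; different args -> a distinct key, so different
# configurations never collide in the cache.
k1 = get_class_cache_key(Worker, ("resnet50",), {})
k2 = get_class_cache_key(Worker, ("resnet50",), {})
k3 = get_class_cache_key(Worker, ("vit-b16",), {})

assert k1 == k2
assert k1 != k3
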
{tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tetra_rp
-Version: 0.8.0
+Version: 0.10.0
 Summary: A Python library for distributed inference and serving of machine learning models
 Author-email: Marut Pandya <pandyamarut@gmail.com>, Patrick Rachford <prachford@icloud.com>, Dean Quinanola <dean.quinanola@runpod.io>
 License: MIT

@@ -11,7 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: <3.14,>=3.9
 Description-Content-Type: text/markdown
 Requires-Dist: cloudpickle>=3.1.1
-Requires-Dist: runpod…
+Requires-Dist: runpod…
 Requires-Dist: python-dotenv>=1.0.0
 
 # Tetra: Serverless computing for AI workloads

@@ -801,6 +801,6 @@ def fetch_data(url):
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
 
 <p align="center">
-  <a href="https://github.com/…
+  <a href="https://github.com/runpod/tetra-rp">Tetra</a> •
   <a href="https://runpod.io">Runpod</a>
 </p>
{tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/RECORD
CHANGED

@@ -1,16 +1,10 @@
 tetra_rp/__init__.py,sha256=-1S5sYIKtnUV8V1HlSIbX1yZwiUrsO8J5b3ZEIR_phU,687
 tetra_rp/client.py,sha256=rAMMmn4ejAayFXJMZzx7dG_8Y65tCEMI6wSSKgur4zQ,2500
-tetra_rp/execute_class.py,sha256=…
+tetra_rp/execute_class.py,sha256=HoH-qWDA7X6yGvQMwmHn5-MKxbLWHEDEHsuat5dzl2U,11912
 tetra_rp/logger.py,sha256=gk5-PWp3k_GQ5DxndsRkBCX0jarp_3lgZ1oiTFuThQg,1125
 tetra_rp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tetra_rp/core/api/__init__.py,sha256=oldrEKMwxYoBPLvPfVlaFS3wfUtTTxCN6-HzlpTh6vE,124
-tetra_rp/core/api/runpod.py,sha256=…
-tetra_rp/core/pool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tetra_rp/core/pool/cluster_manager.py,sha256=KJxEp_044HjnbOhfIdiXZbks_bFDYE1KgKeR5W9VvbY,6007
-tetra_rp/core/pool/dataclass.py,sha256=YngS328_NTewY8Etitj4k7MmdM5GWqqE_OMbytrVNlw,338
-tetra_rp/core/pool/ex.py,sha256=AZOrn9t_X5ycMl-tDg7-jcIURj_9kVmzn9_da8h1TFI,1273
-tetra_rp/core/pool/job.py,sha256=4bisW_ZwiQ2-qD5l0y9SbHcO4EQvSKimmBBU1fpI_YE,567
-tetra_rp/core/pool/worker.py,sha256=N4cOnf8MiDcPFH2XSMmSnnWMACZYUNnKWVhOx2aSxvM,478
+tetra_rp/core/api/runpod.py,sha256=3TTx1fkXMLZ2R5JCrQYPEn8dhdUsBt8i5OEwAfaKQ_k,10451
 tetra_rp/core/resources/__init__.py,sha256=UhIwo1Y6-tw5qsULamR296sQiztuz-oWrSTreqfmFSw,814
 tetra_rp/core/resources/base.py,sha256=UJeDiFN45aO1n5SBcxn56ohLhj-AWHoj0KO7mF4yJ_o,1440
 tetra_rp/core/resources/cloud.py,sha256=XJOWPfzYlDVJGHxgffcfpEaOKrWhGdi7AzTlaGuYj0o,70

@@ -18,15 +12,17 @@ tetra_rp/core/resources/constants.py,sha256=F1gPqFaXcCmfrbUSO9PQtUBv984TxFc3pySg…
 tetra_rp/core/resources/cpu.py,sha256=YIE-tKolSU3JJzpPB7ey-PbRdqKWsJZ_Ad4h2OYaaiA,1231
 tetra_rp/core/resources/environment.py,sha256=FC9kJCa8YLSar75AKUKqJYnNLrUdjZj8ZTOrspBrS00,1267
 tetra_rp/core/resources/gpu.py,sha256=2jIIMr8PNnlIAP8ZTKO8Imx-rdxXp2rbdSHJeVfjawk,1858
-tetra_rp/core/resources/live_serverless.py,sha256=…
-tetra_rp/core/resources/network_volume.py,sha256=…
+tetra_rp/core/resources/live_serverless.py,sha256=A3JRdCYwHR2KN_OlmTLcv-m_ObxNhBhc5CnUzXOpOtc,1177
+tetra_rp/core/resources/network_volume.py,sha256=h11dRlAkkxrqyNvUP9Eb8BHAUSFQyRP4lNgBdKChezw,5391
 tetra_rp/core/resources/resource_manager.py,sha256=kUVZDblfUzaG78S8FwOzu4rN6QSegUgQNK3fJ_X7l0w,2834
-tetra_rp/core/resources/serverless.py,sha256=…
+tetra_rp/core/resources/serverless.py,sha256=48mENAPQrR8fMjWFpb7mpGFOMqjXZnRWGULGH7NPa5E,13629
 tetra_rp/core/resources/template.py,sha256=UkflJXZFWIbQkLuUt4oRLAjn-yIpw9_mT2X1cAH69CU,3141
 tetra_rp/core/resources/utils.py,sha256=mgXfgz_NuHN_IC7TzMNdH9II-LMjxcDCG7syDTcPiGs,1721
 tetra_rp/core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tetra_rp/core/utils/backoff.py,sha256=1pfa0smFNpib8nztcIgBbtrVvQeECKh-aNOfL2TztgU,1324
+tetra_rp/core/utils/constants.py,sha256=Dm4XiO5zTzfdqOSeYVfAjaf2LyHnIEVmbOi_s_k1J_E,375
 tetra_rp/core/utils/json.py,sha256=q0r7aEdfh8kKVeHGeh9fBDfuhHYNopSreislAMB6HhM,1163
+tetra_rp/core/utils/lru_cache.py,sha256=drwKg-DfLbeBRGTzuxKqNKMQq0EuZV15LMTZIOyZuVk,2618
 tetra_rp/core/utils/singleton.py,sha256=JRli0HhBfq4P9mBUOg1TZUUwMvIenRqWdymX3qFMm2k,210
 tetra_rp/protos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tetra_rp/protos/remote_execution.py,sha256=F4uwobnp5q-lX3lR7NCAB23J6OzlzcsB35cezwuoSnI,4638

@@ -34,7 +30,7 @@ tetra_rp/stubs/__init__.py,sha256=ozKsHs8q0T7o2qhQEquub9hqomh1Htys53mMraaRu2E,72
 tetra_rp/stubs/live_serverless.py,sha256=o1NH5XEwUD-27NXJsEGO0IwnuDp8iXwUiw5nZtaZZOI,4199
 tetra_rp/stubs/registry.py,sha256=dmbyC7uBp04_sXsG2wJCloFfFRzYjYQ-naEBKhTRo-U,2839
 tetra_rp/stubs/serverless.py,sha256=BM_a5Ml5VADBYu2WRNmo9qnicP8NnXDGl5ywifulbD0,947
-tetra_rp-0.8.0.dist-info/METADATA,sha256=…
-tetra_rp-0.8.0.dist-info/WHEEL,sha256=…
-tetra_rp-0.8.0.dist-info/top_level.txt,sha256=…
-tetra_rp-0.8.0.dist-info/RECORD,,
+tetra_rp-0.10.0.dist-info/METADATA,sha256=Ck626kHGCXM6r5CHIm9P7gcg1q3IGWhB7Wiw7x0yIJs,28046
+tetra_rp-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tetra_rp-0.10.0.dist-info/top_level.txt,sha256=bBay7JTDwJXsTYvVjrwno9hnF-j0q272lk65f2AcPjU,9
+tetra_rp-0.10.0.dist-info/RECORD,,
tetra_rp/core/pool/__init__.py
DELETED
File without changes

tetra_rp/core/pool/cluster_manager.py
DELETED

@@ -1,177 +0,0 @@
-import time
-from worker import Worker
-from job import Job
-
-from dataclass import WorkerStatus, JobStatus
-
-import logging
-import inspect
-
-
-def setup_logging(level=logging.INFO, fmt=None):
-    if fmt is None:
-        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    logging.basicConfig(level=level, format=fmt)
-
-
-def get_logger(name=None):
-    """
-    Returns a logger. If no name is provided, it infers the caller's module name.
-    """
-    if name is None:
-        # Get the caller's module name.
-        frame = inspect.stack()[1]
-        module = inspect.getmodule(frame[0])
-        name = module.__name__ if module else "__main__"
-    return logging.getLogger(name)
-
-
-logger = get_logger(__name__)
-
-
-class ClusterManager:
-    """
-    Manages workers and Jobs currently in Memory:
-    - Runpod for provisioning
-    - Real remote execution
-    - Data base for the
-    """
-
-    def __init__(self):
-        self.workers = {}  # Worker ID -> Worker
-        self.jobs = {}  # Job ID -> Job
-
-    # ----------------- Worker Management -----------------
-    # ------------------------------------------------------
-    def add_worker(self, resource_config: dict):
-        """
-        Add a new worker to the cluster
-        """
-        # here will go the logic to create a worker and add it to the cluster: RUNPOD LOGIC will be added here.
-        worker = Worker(resource_config)
-        self.workers[worker.worker_id] = worker
-
-        logger.info(f"Added worker {worker.worker_id} to the cluster")
-        return worker.worker_id
-
-    def remove_worker(self, worker_id):
-        """
-        Remove a worker from the cluster
-        """
-        worker = self.workers.get(worker_id)
-        if not worker:
-            logger.error(f"Worker {worker_id} not found")
-            return False
-        if worker.status == WorkerStatus.RUNNING:
-            logger.error(f"Worker {worker_id} is still running")
-            return False
-        del self.workers[worker_id]
-        logger.info(f"Removed worker {worker_id} from the cluster")
-        return True
-
-    def list_workers(self):
-        """
-        List all workers in the cluster
-        """
-        return list(self.workers.values())
-
-    # ----------------- Job Management -----------------
-    # ---------------------------------------------------
-
-    def submit_job(self, resource_config: dict):
-        """
-        Submit a new job to the cluster (Queueud). Then attempt to scheduel it.
-        """
-        job = Job(resource_config)
-        self.jobs[job.job_id] = job
-        logger.info(f"Submitted job {job.job_id} to the cluster")
-        # attempt to schedule the job
-        self.schedule_job(job)
-        return job.job_id
-
-    def schedule_job(self, job: Job):
-        """
-        find a suitable worker for the job. It none, Job remains queued.
-        If we want to a auto provision we can actually add a logic here to add a worker if none is available.
-        """
-        if job.status != JobStatus.QUEUED:
-            logger.error(f"Job {job.job_id} is not pending")
-            return False
-
-        # Find worker candidate
-        candidate = self.find_idle_worker(job.resource_config)
-        if candidate:
-            self.assign_job_to_worker(job, candidate)
-        else:
-            logger.info(f"No worker available for job {job.job_id}")
-            # we cn either provision new worker from here and then scehediule the job from here.
-
-    def find_idle_worker(self, resource_config: dict):
-        """
-        Find an idle worker that can run the job
-        """
-        for w in self.workers.values():
-            if w.status == WorkerStatus.IDLE:
-                # check the resource config
-                if w.resource_config == resource_config:
-                    continue
-                return w
-        return None
-
-    def assign_job_to_worker(self, job: Job, worker: Worker):
-        """
-        Mark the job as running and the worker as Running and 'execute' the job.
-        In a real system, we would send a remote command to the worker (eg: gRPC) to execute the job.
-        """
-        job.worker_id = worker.worker_id
-        job.status = JobStatus.RUNNING
-        worker.status = WorkerStatus.RUNNING
-        worker.current_job_id = job.job_id
-        logger.info(f"Assigned job {job.job_id} to worker {worker.worker_id}")
-        self._execute_job(job, worker)
-
-    def _execute_job(self, job: Job, worker: Worker):
-        """
-        Simulate the remote execution. right now, we jsut sleep for 1s.
-        In production, what we we can do is:
-        - Open a gRPC connection to the worker
-        - pass the job details
-        - wait for the compeltion call back
-        """
-        try:
-            logger.info(f"Executing job {job.job_id} on worker {worker.worker_id}")
-            time.sleep(
-                1
-            )  # Here we can add the actual execution logic, currently it mimics the execution.
-
-            # mark the job as completed
-            job.status = JobStatus.COMPLETED
-            job.result = "Job completed successfully"
-            logger.info(f"[Cluster Manager] Job {job.job_id} completed successfully")
-        except Exception as e:
-            job.status = JobStatus.FAILED
-            job.result = f"Job failed: {str(e)}"
-            logger.error(f"[Cluster Manager] Job {job.job_id} failed: {str(e)}")
-        finally:
-            worker.status = WorkerStatus.IDLE
-            worker.current_job_id = None
-
-    def get_job_status(self, job_id):
-        """
-        Get the job details
-        """
-        job = self.jobs.get(job_id)
-        if not job:
-            logger.error(f"Job {job_id} not found")
-            return None
-        return job
-
-    # this function has retry logic but it's currently fuzzy, we might have to change it.
-
-    def retry_queued_jobs(self):
-        """
-        Retry all queued jobs
-        """
-        for job in self.jobs.values():
-            if job.status == JobStatus.QUEUED:
-                self.schedule_job(job)
tetra_rp/core/pool/dataclass.py
DELETED
@@ -1,18 +0,0 @@
-from enum import Enum
-
-
-class WorkerStatus(Enum):
-    """Enum representing the status of a worker"""
-
-    IDLE = "idle"
-    RUNNING = "running"
-    OFFLINE = "offline"
-
-
-class JobStatus(Enum):
-    """Enum representing the status of a job"""
-
-    QUEUED = "queued"
-    RUNNING = "running"
-    COMPLETED = "completed"
-    FAILED = "failed"
tetra_rp/core/pool/ex.py
DELETED
@@ -1,38 +0,0 @@
-from cluster_manager import ClusterManager
-
-
-if __name__ == "__main__":
-    cm = ClusterManager()
-
-    # 1) Submit a job with no existing workers (use resource_config dict)
-    job_id = cm.submit_job(
-        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
-    )
-    print(
-        "Job status:", cm.get_job_status(job_id)
-    )  # should be QUEUED, no suitable worker
-
-    # 2) Add a worker that doesn't match the GPU
-    w1 = cm.add_worker(
-        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
-    )
-    # Re-try scheduling
-    cm.retry_queued_jobs()
-    print("Job status (still queued):", cm.get_job_status(job_id))
-
-    # 3) Add a matching worker
-    w2 = cm.add_worker(
-        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
-    )
-    # Re-try scheduling
-    cm.retry_queued_jobs()
-    print("Job status (should complete):", cm.get_job_status(job_id))
-
-    # 4) Submit another job that requires less resources
-    job_id2 = cm.submit_job(resource_config={"memory": 8, "network_volume": 10})
-    # Should be assigned to w1 if it's idle
-    print("Job2 final status:", cm.get_job_status(job_id2))
-
-    # 5) Show final state of workers
-    for worker in cm.list_workers():
-        print("Worker:", worker)
tetra_rp/core/pool/job.py
DELETED
@@ -1,22 +0,0 @@
-import uuid
-from dataclass import JobStatus
-
-
-class Job:
-    """Represents a 'job' in the system
-
-    In a real system, this might contain the function to run,
-    arguments, and reference to data or code.
-    """
-
-    def __init__(self, resource_config: dict):
-        self.job_id = str(uuid.uuid4())[:8]
-        self.resource_config = resource_config
-        self.status = JobStatus.QUEUED
-
-        self.worker_id = None
-        self.result = None
-        self.error = None
-
-    def __repr__(self):
-        return f"Job(job_id={self.job_id}, status={self.status})"
tetra_rp/core/pool/worker.py
DELETED
@@ -1,19 +0,0 @@
-import uuid
-from dataclass import WorkerStatus
-
-
-class Worker:
-    """Represents a single worker in the pool
-
-    For Now we store ressources in memory
-    """
-
-    def __init__(self, resource_config: dict):
-        self.worker_id = str(uuid.uuid4())[:8]
-        self.resource_config = resource_config
-        self.status = WorkerStatus.IDLE
-
-        self.current_job_id = None
-
-    def __repr__(self):
-        return f"Worker(worker_id={self.worker_id}, status={self.status})"

{tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/WHEEL
File without changes

{tetra_rp-0.8.0.dist-info → tetra_rp-0.10.0.dist-info}/top_level.txt
File without changes