tetra-rp 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tetra_rp/core/api/runpod.py CHANGED
@@ -281,6 +281,30 @@ class RunpodRestClient:
 
         return result
 
+    async def list_network_volumes(self) -> Dict[str, Any]:
+        """
+        List all network volumes in Runpod.
+
+        Returns:
+            List of network volume objects or dict containing networkVolumes key.
+            The API may return either format depending on version.
+        """
+        log.debug("Listing network volumes")
+
+        result = await self._execute_rest(
+            "GET", f"{RUNPOD_REST_API_URL}/networkvolumes"
+        )
+
+        # Handle both list and dict responses
+        if isinstance(result, list):
+            volume_count = len(result)
+        else:
+            volume_count = len(result.get("networkVolumes", []))
+
+        log.debug(f"Listed {volume_count} network volumes")
+
+        return result
+
     async def close(self):
        """Close the HTTP session."""
        if self.session and not self.session.closed:
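As the docstring notes, the endpoint may return either a bare list or a dict wrapping a `networkVolumes` key, so callers have to normalize the shape themselves. A minimal usage sketch (assuming `RunpodRestClient` supports `async with`, as it does elsewhere in this diff):

```python
import asyncio

from tetra_rp.core.api.runpod import RunpodRestClient


async def main():
    async with RunpodRestClient() as client:
        result = await client.list_network_volumes()
        # Normalize the two documented response shapes into a list
        volumes = result if isinstance(result, list) else result.get("networkVolumes", [])
        for vol in volumes:
            print(vol.get("id"), vol.get("name"), vol.get("dataCenterId"))


asyncio.run(main())
```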
tetra_rp/core/resources/live_serverless.py CHANGED
@@ -3,9 +3,13 @@ import os
 from pydantic import model_validator
 from .serverless import ServerlessEndpoint
 
-
-TETRA_GPU_IMAGE = os.environ.get("TETRA_GPU_IMAGE", "runpod/tetra-rp:dev")
-TETRA_CPU_IMAGE = os.environ.get("TETRA_CPU_IMAGE", "runpod/tetra-rp-cpu:dev")
+TETRA_IMAGE_TAG = os.environ.get("TETRA_IMAGE_TAG", "latest")
+TETRA_GPU_IMAGE = os.environ.get(
+    "TETRA_GPU_IMAGE", f"runpod/tetra-rp:{TETRA_IMAGE_TAG}"
+)
+TETRA_CPU_IMAGE = os.environ.get(
+    "TETRA_CPU_IMAGE", f"runpod/tetra-rp-cpu:{TETRA_IMAGE_TAG}"
+)
 
 
 class LiveServerless(ServerlessEndpoint):
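Because these defaults are resolved at import time, `TETRA_IMAGE_TAG` must be set in the environment before `tetra_rp` is imported. A sketch (the tag value shown is hypothetical):

```python
import os

# Set before importing tetra_rp modules; the defaults above are
# computed once at import time.
os.environ["TETRA_IMAGE_TAG"] = "0.10.0"  # hypothetical tag

from tetra_rp.core.resources.live_serverless import TETRA_CPU_IMAGE, TETRA_GPU_IMAGE

print(TETRA_GPU_IMAGE)  # runpod/tetra-rp:0.10.0
print(TETRA_CPU_IMAGE)  # runpod/tetra-rp-cpu:0.10.0
```

Setting `TETRA_GPU_IMAGE` or `TETRA_CPU_IMAGE` directly still takes precedence, since the tag is only interpolated into the defaults.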
tetra_rp/core/resources/network_volume.py CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 from enum import Enum
 from typing import Optional
@@ -25,10 +26,11 @@ class DataCenter(str, Enum):
 
 class NetworkVolume(DeployableResource):
     """
-    NetworkVolume resource for creating and managing Runpod netowrk volumes.
+    NetworkVolume resource for creating and managing Runpod network volumes.
 
     This class handles the creation, deployment, and management of network volumes
-    that can be attached to serverless resources.
+    that can be attached to serverless resources. Supports idempotent deployment
+    where multiple volumes with the same name will reuse existing volumes.
 
     """
 
@@ -37,11 +39,24 @@ class NetworkVolume(DeployableResource):
 
     id: Optional[str] = Field(default=None)
     name: Optional[str] = None
-    size: Optional[int] = Field(default=10, gt=0)  # Size in GB
+    size: Optional[int] = Field(default=50, gt=0)  # Size in GB
 
     def __str__(self) -> str:
         return f"{self.__class__.__name__}:{self.id}"
 
+    @property
+    def resource_id(self) -> str:
+        """Unique resource ID based on name and datacenter for idempotent behavior."""
+        if self.name:
+            # Use name + datacenter for volumes with names to ensure idempotence
+            resource_type = self.__class__.__name__
+            config_key = f"{self.name}:{self.dataCenterId.value}"
+            hash_obj = hashlib.md5(f"{resource_type}:{config_key}".encode())
+            return f"{resource_type}_{hash_obj.hexdigest()}"
+        else:
+            # Fall back to default behavior for unnamed volumes
+            return super().resource_id
+
     @field_serializer("dataCenterId")
     def serialize_data_center_id(self, value: Optional[DataCenter]) -> Optional[str]:
         """Convert DataCenter enum to string."""
@@ -61,24 +76,57 @@ class NetworkVolume(DeployableResource):
             raise ValueError("Network volume ID is not set")
         return f"{CONSOLE_BASE_URL}/user/storage"
 
-    async def create_network_volume(self) -> str:
+    def is_deployed(self) -> bool:
         """
-        Creates a network volume using the provided configuration.
-        Returns the volume ID.
+        Checks if the network volume resource is deployed and available.
         """
-        async with RunpodRestClient() as client:
-            # Create the network volume
-            payload = self.model_dump(exclude_none=True)
-            result = await client.create_network_volume(payload)
+        return self.id is not None
+
+    def _normalize_volumes_response(self, volumes_response) -> list:
+        """Normalize API response to list format."""
+        if isinstance(volumes_response, list):
+            return volumes_response
+        return volumes_response.get("networkVolumes", [])
+
+    def _find_matching_volume(self, existing_volumes: list) -> Optional[dict]:
+        """Find existing volume matching name and datacenter."""
+        for volume_data in existing_volumes:
+            if (
+                volume_data.get("name") == self.name
+                and volume_data.get("dataCenterId") == self.dataCenterId.value
+            ):
+                return volume_data
+        return None
+
+    async def _find_existing_volume(self, client) -> Optional["NetworkVolume"]:
+        """Check for existing volume with same name and datacenter."""
+        if not self.name:
+            return None
+
+        log.debug(f"Checking for existing network volume with name: {self.name}")
+        volumes_response = await client.list_network_volumes()
+        existing_volumes = self._normalize_volumes_response(volumes_response)
+
+        if matching_volume := self._find_matching_volume(existing_volumes):
+            log.info(
+                f"Found existing network volume: {matching_volume.get('id')} with name '{self.name}'"
+            )
+            # Update our instance with the existing volume's ID
+            self.id = matching_volume.get("id")
+            return self
+
+        return None
+
+    async def _create_new_volume(self, client) -> "NetworkVolume":
+        """Create a new network volume."""
+        log.debug(f"Creating new network volume: {self.name or 'unnamed'}")
+        payload = self.model_dump(exclude_none=True)
+        result = await client.create_network_volume(payload)
 
         if volume := self.__class__(**result):
             return volume
 
-    def is_deployed(self) -> bool:
-        """
-        Checks if the network volume resource is deployed and available.
-        """
-        return self.id is not None
+        raise ValueError("Deployment failed, no volume was created.")
 
     async def deploy(self) -> "DeployableResource":
         """
@@ -91,16 +139,13 @@ class NetworkVolume(DeployableResource):
                 log.debug(f"{self} exists")
                 return self
 
-            # Create the network volume
             async with RunpodRestClient() as client:
-                # Create the network volume
-                payload = self.model_dump(exclude_none=True)
-                result = await client.create_network_volume(payload)
-
-                if volume := self.__class__(**result):
-                    return volume
+                # Check for existing volume first
+                if existing_volume := await self._find_existing_volume(client):
+                    return existing_volume
 
-                raise ValueError("Deployment failed, no volume was created.")
+                # No existing volume found, create a new one
+                return await self._create_new_volume(client)
 
         except Exception as e:
             log.error(f"{self} failed to deploy: {e}")
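With the lookup in place, `deploy()` becomes idempotent for named volumes: a second deploy of the same configuration adopts the existing volume instead of creating a duplicate. A hedged usage sketch (field names follow this diff; the `dataCenterId` default is not shown in the diff and is assumed to exist):

```python
import asyncio

from tetra_rp.core.resources.network_volume import NetworkVolume


async def main():
    # Relying on the (unshown) dataCenterId default here
    volume = NetworkVolume(name="training-data", size=50)

    first = await volume.deploy()   # creates the volume, or adopts one with this name
    second = await volume.deploy()  # finds the same volume by name + datacenter

    assert first.id == second.id    # no duplicate volume is created


asyncio.run(main())
```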
tetra_rp/core/resources/serverless.py CHANGED
@@ -134,8 +134,12 @@ class ServerlessResource(DeployableResource):
         return value.value if value is not None else None
 
     @field_serializer("instanceIds")
-    def serialize_instance_ids(self, value: List[CpuInstanceType]) -> List[str]:
+    def serialize_instance_ids(
+        self, value: Optional[List[CpuInstanceType]]
+    ) -> Optional[List[str]]:
         """Convert CpuInstanceType enums to strings."""
+        if value is None:
+            return None
         return [item.value if hasattr(item, "value") else str(item) for item in value]
 
     @field_validator("gpus")
@@ -247,62 +251,6 @@ class ServerlessResource(DeployableResource):
             log.error(f"{self} failed to deploy: {e}")
             raise
 
-    async def is_ready_for_requests(self, give_up_threshold=10) -> bool:
-        """
-        Asynchronously checks if the serverless resource is ready to handle
-        requests by polling its health endpoint.
-
-        Args:
-            give_up_threshold (int, optional): The maximum number of polling
-                attempts before giving up and raising an error. Defaults to 10.
-
-        Returns:
-            bool: True if the serverless resource is ready for requests.
-
-        Raises:
-            ValueError: If the serverless resource is not deployed.
-            RuntimeError: If the health status is THROTTLED, UNHEALTHY, or UNKNOWN
-                after exceeding the give_up_threshold.
-        """
-        if not self.is_deployed():
-            raise ValueError("Serverless is not deployed")
-
-        log.debug(f"{self} | API /health")
-
-        current_pace = 0
-        attempt = 0
-
-        # Poll for health status
-        while True:
-            await asyncio.sleep(current_pace)
-
-            health = await asyncio.to_thread(self.endpoint.health)
-            health = ServerlessHealth(**health)
-
-            if health.is_ready:
-                return True
-            else:
-                # nothing changed, increase the gap
-                attempt += 1
-                indicator = "." * (attempt // 2) if attempt % 2 == 0 else ""
-                if indicator:
-                    log.info(f"{self} | {indicator}")
-
-                status = health.workers.status
-                if status in [
-                    Status.THROTTLED,
-                    Status.UNHEALTHY,
-                    Status.UNKNOWN,
-                ]:
-                    log.debug(f"{self} | Health {status.value}")
-
-                    if attempt >= give_up_threshold:
-                        # Give up
-                        raise RuntimeError(f"Health {status.value}")
-
-            # Adjust polling pace appropriately
-            current_pace = get_backoff_delay(attempt)
-
     async def run_sync(self, payload: Dict[str, Any]) -> "JobOutput":
         """
         Executes a serverless endpoint request with the payload.
@@ -319,9 +267,6 @@ class ServerlessResource(DeployableResource):
         try:
             # log.debug(f"[{log_group}] Payload: {payload}")
 
-            # Poll until requests can be sent
-            await self.is_ready_for_requests()
-
             log.info(f"{self} | API /run_sync")
             response = await asyncio.to_thread(_fetch_job)
             return JobOutput(**response)
@@ -346,9 +291,6 @@ class ServerlessResource(DeployableResource):
         try:
             # log.debug(f"[{self}] Payload: {payload}")
 
-            # Poll until requests can be sent
-            await self.is_ready_for_requests()
-
             # Create a job using the endpoint
             log.info(f"{self} | API /run")
             job = await asyncio.to_thread(self.endpoint.run, request_input=payload)
@@ -366,9 +308,8 @@ class ServerlessResource(DeployableResource):
         while True:
             await asyncio.sleep(current_pace)
 
-            if await self.is_ready_for_requests():
-                # Check job status
-                job_status = await asyncio.to_thread(job.status)
+            # Check job status
+            job_status = await asyncio.to_thread(job.status)
 
             if last_status == job_status:
                 # nothing changed, increase the gap
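With the health gate removed, pacing in this loop relies entirely on `get_backoff_delay` from `tetra_rp/core/utils/backoff.py`, whose implementation is not shown in this diff. For orientation, a typical capped exponential backoff with jitter looks like:

```python
import random


def get_backoff_delay(attempt: int, base: float = 1.0, cap: float = 30.0) -> float:
    """Illustrative only; the real tetra_rp implementation may differ."""
    delay = min(cap, base * (2 ** attempt))  # exponential growth, capped
    return delay * random.uniform(0.5, 1.0)  # jitter to avoid synchronized polling
```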
tetra_rp/core/utils/constants.py ADDED
@@ -0,0 +1,10 @@
+"""
+Constants for utility modules and caching configurations.
+
+This module contains configurable constants used across the tetra-rp codebase
+to ensure consistency and easy maintenance.
+"""
+
+# Cache key generation constants
+HASH_TRUNCATE_LENGTH = 16  # Length to truncate hash values for cache keys
+UUID_FALLBACK_LENGTH = 8  # Length to truncate UUID values for fallback keys
tetra_rp/core/utils/lru_cache.py ADDED
@@ -0,0 +1,75 @@
+"""
+LRU Cache implementation using OrderedDict for memory-efficient caching with automatic eviction.
+
+This module provides a Least Recently Used (LRU) cache implementation that automatically
+manages memory by evicting the least recently used items when the cache exceeds its
+maximum size limit. It maintains O(1) access time and provides a dict-like interface.
+Thread-safe for concurrent access.
+"""
+
+import threading
+from collections import OrderedDict
+from typing import Any, Dict, Optional
+
+
+class LRUCache:
+    """
+    A Least Recently Used (LRU) cache implementation using OrderedDict.
+
+    Automatically evicts the least recently used items when the cache exceeds
+    the maximum size limit. Provides dict-like interface with O(1) operations.
+    Thread-safe for concurrent access using RLock.
+
+    Args:
+        max_size: Maximum number of items to store in cache (default: 1000)
+    """
+
+    def __init__(self, max_size: int = 1000):
+        self.max_size = max_size
+        self.cache = OrderedDict()
+        self._lock = threading.RLock()
+
+    def get(self, key: str) -> Optional[Dict[str, Any]]:
+        """Get item from cache, moving it to end (most recent) if found."""
+        with self._lock:
+            if key in self.cache:
+                self.cache.move_to_end(key)
+                return self.cache[key]
+            return None
+
+    def set(self, key: str, value: Dict[str, Any]) -> None:
+        """Set item in cache, evicting oldest if at capacity."""
+        with self._lock:
+            if key in self.cache:
+                self.cache.move_to_end(key)
+            else:
+                if len(self.cache) >= self.max_size:
+                    self.cache.popitem(last=False)  # Remove oldest
+            self.cache[key] = value
+
+    def clear(self) -> None:
+        """Clear all items from cache."""
+        with self._lock:
+            self.cache.clear()
+
+    def __contains__(self, key: str) -> bool:
+        """Check if key exists in cache."""
+        with self._lock:
+            return key in self.cache
+
+    def __len__(self) -> int:
+        """Return number of items in cache."""
+        with self._lock:
+            return len(self.cache)
+
+    def __getitem__(self, key: str) -> Dict[str, Any]:
+        """Get item using bracket notation, moving to end if found."""
+        with self._lock:
+            if key in self.cache:
+                self.cache.move_to_end(key)
+                return self.cache[key]
+            raise KeyError(key)
+
+    def __setitem__(self, key: str, value: Dict[str, Any]) -> None:
+        """Set item using bracket notation."""
+        self.set(key, value)
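A quick sketch of the eviction behavior, using only the methods defined above:

```python
from tetra_rp.core.utils.lru_cache import LRUCache

cache = LRUCache(max_size=2)
cache["a"] = {"v": 1}
cache["b"] = {"v": 2}

_ = cache["a"]           # touching "a" makes "b" the least recently used entry
cache["c"] = {"v": 3}    # at capacity: "b" is evicted

print("a" in cache, "b" in cache, "c" in cache)  # True False True
print(len(cache))                                # 2
```

Using `RLock` rather than `Lock` keeps the class safe if a thread that already holds the lock re-enters another method.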
tetra_rp/execute_class.py CHANGED
@@ -1,18 +1,97 @@
+"""
+Class execution module for remote class instantiation and method calls.
+
+This module provides functionality to create and execute remote class instances,
+with automatic caching of class serialization data to improve performance and
+prevent memory leaks through LRU eviction.
+"""
+
 import base64
+import hashlib
 import inspect
 import logging
 import textwrap
 import uuid
-from typing import List, Type, Optional
+from typing import List, Optional, Type
 
 import cloudpickle
 
 from .core.resources import ResourceManager, ServerlessResource
+from .core.utils.constants import HASH_TRUNCATE_LENGTH, UUID_FALLBACK_LENGTH
+from .core.utils.lru_cache import LRUCache
 from .protos.remote_execution import FunctionRequest
 from .stubs import stub_resource
 
 log = logging.getLogger(__name__)
 
+# Global in-memory cache for serialized class data with LRU eviction
+_SERIALIZED_CLASS_CACHE = LRUCache(max_size=1000)
+
+
+def serialize_constructor_args(args, kwargs):
+    """Serialize constructor arguments for caching."""
+    serialized_args = [
+        base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8") for arg in args
+    ]
+    serialized_kwargs = {
+        k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
+        for k, v in kwargs.items()
+    }
+    return serialized_args, serialized_kwargs
+
+
+def get_or_cache_class_data(
+    cls: Type, args: tuple, kwargs: dict, cache_key: str
+) -> str:
+    """Get class code from cache or extract and cache it."""
+    if cache_key not in _SERIALIZED_CLASS_CACHE:
+        # Cache miss - extract and cache class code
+        clean_class_code = extract_class_code_simple(cls)
+
+        try:
+            serialized_args, serialized_kwargs = serialize_constructor_args(
+                args, kwargs
+            )
+
+            # Cache the serialized data
+            _SERIALIZED_CLASS_CACHE.set(
+                cache_key,
+                {
+                    "class_code": clean_class_code,
+                    "constructor_args": serialized_args,
+                    "constructor_kwargs": serialized_kwargs,
+                },
+            )
+
+            log.debug(f"Cached class data for {cls.__name__} with key: {cache_key}")
+
+        except (TypeError, AttributeError, OSError) as e:
+            log.warning(
+                f"Could not serialize constructor arguments for {cls.__name__}: {e}"
+            )
+            log.warning(
+                f"Skipping constructor argument caching for {cls.__name__} due to unserializable arguments"
+            )
+
+            # Store minimal cache entry to avoid repeated attempts
+            _SERIALIZED_CLASS_CACHE.set(
+                cache_key,
+                {
+                    "class_code": clean_class_code,
+                    "constructor_args": None,  # Signal that args couldn't be cached
+                    "constructor_kwargs": None,
+                },
+            )
+
+        return clean_class_code
+    else:
+        # Cache hit - retrieve cached data
+        cached_data = _SERIALIZED_CLASS_CACHE.get(cache_key)
+        log.debug(
+            f"Retrieved cached class data for {cls.__name__} with key: {cache_key}"
+        )
+        return cached_data["class_code"]
+
 
 def extract_class_code_simple(cls: Type) -> str:
     """Extract clean class code without decorators and proper indentation"""
@@ -78,6 +157,46 @@ def extract_class_code_simple(cls: Type) -> str:
     return fallback_code
 
 
+def get_class_cache_key(
+    cls: Type, constructor_args: tuple, constructor_kwargs: dict
+) -> str:
+    """Generate a cache key for class serialization based on class source and constructor args.
+
+    Args:
+        cls: The class type to generate a key for
+        constructor_args: Positional arguments passed to class constructor
+        constructor_kwargs: Keyword arguments passed to class constructor
+
+    Returns:
+        A unique cache key string, or a UUID-based fallback if serialization fails
+
+    Note:
+        Falls back to UUID-based key if constructor arguments cannot be serialized,
+        which disables caching benefits but maintains functionality.
+    """
+    try:
+        # Get class source code for hashing
+        class_source = extract_class_code_simple(cls)
+
+        # Create hash of class source
+        class_hash = hashlib.sha256(class_source.encode()).hexdigest()
+
+        # Create hash of constructor arguments
+        args_data = cloudpickle.dumps((constructor_args, constructor_kwargs))
+        args_hash = hashlib.sha256(args_data).hexdigest()
+
+        # Combine hashes for final cache key
+        cache_key = f"{cls.__name__}_{class_hash[:HASH_TRUNCATE_LENGTH]}_{args_hash[:HASH_TRUNCATE_LENGTH]}"
+
+        log.debug(f"Generated cache key for {cls.__name__}: {cache_key}")
+        return cache_key
+
+    except (TypeError, AttributeError, OSError) as e:
+        log.warning(f"Could not generate cache key for {cls.__name__}: {e}")
+        # Fallback to basic key without caching benefits
+        return f"{cls.__name__}_{uuid.uuid4().hex[:UUID_FALLBACK_LENGTH]}"
+
+
 def create_remote_class(
     cls: Type,
     resource_config: ServerlessResource,
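The key deliberately collides for identical class-source/constructor-argument pairs and changes when either changes. A standalone illustration of the scheme (not the library function itself):

```python
import hashlib

import cloudpickle

HASH_TRUNCATE_LENGTH = 16  # mirrors tetra_rp.core.utils.constants


def cache_key(class_name: str, class_source: str, args: tuple, kwargs: dict) -> str:
    # Hash of the class source + hash of the pickled constructor arguments
    class_hash = hashlib.sha256(class_source.encode()).hexdigest()
    args_hash = hashlib.sha256(cloudpickle.dumps((args, kwargs))).hexdigest()
    return f"{class_name}_{class_hash[:HASH_TRUNCATE_LENGTH]}_{args_hash[:HASH_TRUNCATE_LENGTH]}"


src = "class Model:\n    pass\n"
assert cache_key("Model", src, (1,), {}) == cache_key("Model", src, (1,), {})  # stable: cache hit
assert cache_key("Model", src, (1,), {}) != cache_key("Model", src, (2,), {})  # args differ: new entry
```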
@@ -103,10 +222,16 @@
             self._extra = extra
             self._constructor_args = args
             self._constructor_kwargs = kwargs
-            self._instance_id = f"{cls.__name__}_{uuid.uuid4().hex[:8]}"
+            self._instance_id = (
+                f"{cls.__name__}_{uuid.uuid4().hex[:UUID_FALLBACK_LENGTH]}"
+            )
             self._initialized = False
 
-            self._clean_class_code = extract_class_code_simple(cls)
+            # Generate cache key and get class code
+            self._cache_key = get_class_cache_key(cls, args, kwargs)
+            self._clean_class_code = get_or_cache_class_data(
+                cls, args, kwargs, self._cache_key
+            )
 
             log.debug(f"Created remote class wrapper for {cls.__name__}")
 
@@ -136,32 +261,45 @@
         async def method_proxy(*args, **kwargs):
             await self._ensure_initialized()
 
-            # Create class method request
-
-            # class_code = inspect.getsource(self._class_type)
-            class_code = self._clean_class_code
+            # Get cached data
+            cached_data = _SERIALIZED_CLASS_CACHE.get(self._cache_key)
+
+            # Serialize method arguments (these change per call, so no caching)
+            method_args = [
+                base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
+                for arg in args
+            ]
+            method_kwargs = {
+                k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
+                for k, v in kwargs.items()
+            }
+
+            # Handle constructor args - use cached if available, else serialize fresh
+            if cached_data["constructor_args"] is not None:
+                # Use cached constructor args
+                constructor_args = cached_data["constructor_args"]
+                constructor_kwargs = cached_data["constructor_kwargs"]
+            else:
+                # Constructor args couldn't be cached due to serialization issues
+                # Serialize them fresh for each method call (fallback behavior)
+                constructor_args = [
+                    base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
+                    for arg in self._constructor_args
+                ]
+                constructor_kwargs = {
+                    k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
+                    for k, v in self._constructor_kwargs.items()
+                }
 
             request = FunctionRequest(
                 execution_type="class",
                 class_name=self._class_type.__name__,
-                class_code=class_code,
+                class_code=cached_data["class_code"],
                 method_name=name,
-                args=[
-                    base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
-                    for arg in args
-                ],
-                kwargs={
-                    k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
-                    for k, v in kwargs.items()
-                },
-                constructor_args=[
-                    base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")
-                    for arg in self._constructor_args
-                ],
-                constructor_kwargs={
-                    k: base64.b64encode(cloudpickle.dumps(v)).decode("utf-8")
-                    for k, v in self._constructor_kwargs.items()
-                },
+                args=method_args,
+                kwargs=method_kwargs,
+                constructor_args=constructor_args,
+                constructor_kwargs=constructor_kwargs,
                 dependencies=self._dependencies,
                 system_dependencies=self._system_dependencies,
                 instance_id=self._instance_id,
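All arguments cross the wire as base64-encoded cloudpickle blobs. A minimal round trip showing the encoding `method_proxy` applies and the decoding a worker would perform:

```python
import base64

import cloudpickle

# Client side: encode an argument exactly as method_proxy does
arg = {"prompt": "hello", "max_tokens": 128}
encoded = base64.b64encode(cloudpickle.dumps(arg)).decode("utf-8")

# Worker side: reverse the encoding to recover the original object
decoded = cloudpickle.loads(base64.b64decode(encoded))
assert decoded == arg
```

Constructor blobs can be cached because they do not change between calls; method arguments do, so they are serialized fresh on every invocation.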
tetra_rp-0.10.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tetra_rp
-Version: 0.8.0
+Version: 0.10.0
 Summary: A Python library for distributed inference and serving of machine learning models
 Author-email: Marut Pandya <pandyamarut@gmail.com>, Patrick Rachford <prachford@icloud.com>, Dean Quinanola <dean.quinanola@runpod.io>
 License: MIT
@@ -11,7 +11,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: <3.14,>=3.9
 Description-Content-Type: text/markdown
 Requires-Dist: cloudpickle>=3.1.1
-Requires-Dist: runpod~=1.7.9
+Requires-Dist: runpod
 Requires-Dist: python-dotenv>=1.0.0
 
 # Tetra: Serverless computing for AI workloads
@@ -801,6 +801,6 @@ def fetch_data(url):
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
 
 <p align="center">
-  <a href="https://github.com/yourusername/tetra">Tetra</a> •
+  <a href="https://github.com/runpod/tetra-rp">Tetra</a> •
   <a href="https://runpod.io">Runpod</a>
 </p>
tetra_rp-0.10.0.dist-info/RECORD CHANGED
@@ -1,16 +1,10 @@
 tetra_rp/__init__.py,sha256=-1S5sYIKtnUV8V1HlSIbX1yZwiUrsO8J5b3ZEIR_phU,687
 tetra_rp/client.py,sha256=rAMMmn4ejAayFXJMZzx7dG_8Y65tCEMI6wSSKgur4zQ,2500
-tetra_rp/execute_class.py,sha256=OXP1IkORELNFxOi1WHQOfUepmQGfkKmw85iZccaMEww,6515
+tetra_rp/execute_class.py,sha256=HoH-qWDA7X6yGvQMwmHn5-MKxbLWHEDEHsuat5dzl2U,11912
 tetra_rp/logger.py,sha256=gk5-PWp3k_GQ5DxndsRkBCX0jarp_3lgZ1oiTFuThQg,1125
 tetra_rp/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tetra_rp/core/api/__init__.py,sha256=oldrEKMwxYoBPLvPfVlaFS3wfUtTTxCN6-HzlpTh6vE,124
-tetra_rp/core/api/runpod.py,sha256=sux4q6xg2PDRKJI5kLkcW4i8UISZUOmQxsdf0g6wgpw,9711
-tetra_rp/core/pool/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tetra_rp/core/pool/cluster_manager.py,sha256=KJxEp_044HjnbOhfIdiXZbks_bFDYE1KgKeR5W9VvbY,6007
-tetra_rp/core/pool/dataclass.py,sha256=YngS328_NTewY8Etitj4k7MmdM5GWqqE_OMbytrVNlw,338
-tetra_rp/core/pool/ex.py,sha256=AZOrn9t_X5ycMl-tDg7-jcIURj_9kVmzn9_da8h1TFI,1273
-tetra_rp/core/pool/job.py,sha256=4bisW_ZwiQ2-qD5l0y9SbHcO4EQvSKimmBBU1fpI_YE,567
-tetra_rp/core/pool/worker.py,sha256=N4cOnf8MiDcPFH2XSMmSnnWMACZYUNnKWVhOx2aSxvM,478
+tetra_rp/core/api/runpod.py,sha256=3TTx1fkXMLZ2R5JCrQYPEn8dhdUsBt8i5OEwAfaKQ_k,10451
 tetra_rp/core/resources/__init__.py,sha256=UhIwo1Y6-tw5qsULamR296sQiztuz-oWrSTreqfmFSw,814
 tetra_rp/core/resources/base.py,sha256=UJeDiFN45aO1n5SBcxn56ohLhj-AWHoj0KO7mF4yJ_o,1440
 tetra_rp/core/resources/cloud.py,sha256=XJOWPfzYlDVJGHxgffcfpEaOKrWhGdi7AzTlaGuYj0o,70
@@ -18,15 +12,17 @@ tetra_rp/core/resources/constants.py,sha256=F1gPqFaXcCmfrbUSO9PQtUBv984TxFc3pySg
 tetra_rp/core/resources/cpu.py,sha256=YIE-tKolSU3JJzpPB7ey-PbRdqKWsJZ_Ad4h2OYaaiA,1231
 tetra_rp/core/resources/environment.py,sha256=FC9kJCa8YLSar75AKUKqJYnNLrUdjZj8ZTOrspBrS00,1267
 tetra_rp/core/resources/gpu.py,sha256=2jIIMr8PNnlIAP8ZTKO8Imx-rdxXp2rbdSHJeVfjawk,1858
-tetra_rp/core/resources/live_serverless.py,sha256=6r4I4TEx9AmZ0-OJvE86qrY0S7BEx9t_P2zwHVdtbew,1074
-tetra_rp/core/resources/network_volume.py,sha256=5_gwJlxt77VHs7T0d41l3IMZR0LhdoyQhroXCYfFF7w,3274
+tetra_rp/core/resources/live_serverless.py,sha256=A3JRdCYwHR2KN_OlmTLcv-m_ObxNhBhc5CnUzXOpOtc,1177
+tetra_rp/core/resources/network_volume.py,sha256=h11dRlAkkxrqyNvUP9Eb8BHAUSFQyRP4lNgBdKChezw,5391
 tetra_rp/core/resources/resource_manager.py,sha256=kUVZDblfUzaG78S8FwOzu4rN6QSegUgQNK3fJ_X7l0w,2834
-tetra_rp/core/resources/serverless.py,sha256=RYH-gl_edEguGOlxR669Hfi_rXII4OEaYzlB2PhzOhI,15753
+tetra_rp/core/resources/serverless.py,sha256=48mENAPQrR8fMjWFpb7mpGFOMqjXZnRWGULGH7NPa5E,13629
 tetra_rp/core/resources/template.py,sha256=UkflJXZFWIbQkLuUt4oRLAjn-yIpw9_mT2X1cAH69CU,3141
 tetra_rp/core/resources/utils.py,sha256=mgXfgz_NuHN_IC7TzMNdH9II-LMjxcDCG7syDTcPiGs,1721
 tetra_rp/core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tetra_rp/core/utils/backoff.py,sha256=1pfa0smFNpib8nztcIgBbtrVvQeECKh-aNOfL2TztgU,1324
+tetra_rp/core/utils/constants.py,sha256=Dm4XiO5zTzfdqOSeYVfAjaf2LyHnIEVmbOi_s_k1J_E,375
 tetra_rp/core/utils/json.py,sha256=q0r7aEdfh8kKVeHGeh9fBDfuhHYNopSreislAMB6HhM,1163
+tetra_rp/core/utils/lru_cache.py,sha256=drwKg-DfLbeBRGTzuxKqNKMQq0EuZV15LMTZIOyZuVk,2618
 tetra_rp/core/utils/singleton.py,sha256=JRli0HhBfq4P9mBUOg1TZUUwMvIenRqWdymX3qFMm2k,210
 tetra_rp/protos/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tetra_rp/protos/remote_execution.py,sha256=F4uwobnp5q-lX3lR7NCAB23J6OzlzcsB35cezwuoSnI,4638
@@ -34,7 +30,7 @@ tetra_rp/stubs/__init__.py,sha256=ozKsHs8q0T7o2qhQEquub9hqomh1Htys53mMraaRu2E,72
 tetra_rp/stubs/live_serverless.py,sha256=o1NH5XEwUD-27NXJsEGO0IwnuDp8iXwUiw5nZtaZZOI,4199
 tetra_rp/stubs/registry.py,sha256=dmbyC7uBp04_sXsG2wJCloFfFRzYjYQ-naEBKhTRo-U,2839
 tetra_rp/stubs/serverless.py,sha256=BM_a5Ml5VADBYu2WRNmo9qnicP8NnXDGl5ywifulbD0,947
-tetra_rp-0.8.0.dist-info/METADATA,sha256=M0qEc5SQITXYUy_FQLy_EE24CiTbJtrXjU6g5-q3fws,28055
-tetra_rp-0.8.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-tetra_rp-0.8.0.dist-info/top_level.txt,sha256=bBay7JTDwJXsTYvVjrwno9hnF-j0q272lk65f2AcPjU,9
-tetra_rp-0.8.0.dist-info/RECORD,,
+tetra_rp-0.10.0.dist-info/METADATA,sha256=Ck626kHGCXM6r5CHIm9P7gcg1q3IGWhB7Wiw7x0yIJs,28046
+tetra_rp-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tetra_rp-0.10.0.dist-info/top_level.txt,sha256=bBay7JTDwJXsTYvVjrwno9hnF-j0q272lk65f2AcPjU,9
+tetra_rp-0.10.0.dist-info/RECORD,,
tetra_rp/core/pool/cluster_manager.py DELETED
@@ -1,177 +0,0 @@
-import time
-from worker import Worker
-from job import Job
-
-from dataclass import WorkerStatus, JobStatus
-
-import logging
-import inspect
-
-
-def setup_logging(level=logging.INFO, fmt=None):
-    if fmt is None:
-        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    logging.basicConfig(level=level, format=fmt)
-
-
-def get_logger(name=None):
-    """
-    Returns a logger. If no name is provided, it infers the caller's module name.
-    """
-    if name is None:
-        # Get the caller's module name.
-        frame = inspect.stack()[1]
-        module = inspect.getmodule(frame[0])
-        name = module.__name__ if module else "__main__"
-    return logging.getLogger(name)
-
-
-logger = get_logger(__name__)
-
-
-class ClusterManager:
-    """
-    Manages workers and Jobs currently in Memory:
-    - Runpod for provisioning
-    - Real remote execution
-    - Data base for the
-    """
-
-    def __init__(self):
-        self.workers = {}  # Worker ID -> Worker
-        self.jobs = {}  # Job ID -> Job
-
-    # ----------------- Worker Management -----------------
-    # ------------------------------------------------------
-    def add_worker(self, resource_config: dict):
-        """
-        Add a new worker to the cluster
-        """
-        # here will go the logic to create a worker and add it to the cluster: RUNPOD LOGIC will be added here.
-        worker = Worker(resource_config)
-        self.workers[worker.worker_id] = worker
-
-        logger.info(f"Added worker {worker.worker_id} to the cluster")
-        return worker.worker_id
-
-    def remove_worker(self, worker_id):
-        """
-        Remove a worker from the cluster
-        """
-        worker = self.workers.get(worker_id)
-        if not worker:
-            logger.error(f"Worker {worker_id} not found")
-            return False
-        if worker.status == WorkerStatus.RUNNING:
-            logger.error(f"Worker {worker_id} is still running")
-            return False
-        del self.workers[worker_id]
-        logger.info(f"Removed worker {worker_id} from the cluster")
-        return True
-
-    def list_workers(self):
-        """
-        List all workers in the cluster
-        """
-        return list(self.workers.values())
-
-    # ----------------- Job Management -----------------
-    # ---------------------------------------------------
-
-    def submit_job(self, resource_config: dict):
-        """
-        Submit a new job to the cluster (Queueud). Then attempt to scheduel it.
-        """
-        job = Job(resource_config)
-        self.jobs[job.job_id] = job
-        logger.info(f"Submitted job {job.job_id} to the cluster")
-        # attempt to schedule the job
-        self.schedule_job(job)
-        return job.job_id
-
-    def schedule_job(self, job: Job):
-        """
-        find a suitable worker for the job. It none, Job remains queued.
-        If we want to a auto provision we can actually add a logic here to add a worker if none is available.
-        """
-        if job.status != JobStatus.QUEUED:
-            logger.error(f"Job {job.job_id} is not pending")
-            return False
-
-        # Find worker candidate
-        candidate = self.find_idle_worker(job.resource_config)
-        if candidate:
-            self.assign_job_to_worker(job, candidate)
-        else:
-            logger.info(f"No worker available for job {job.job_id}")
-            # we cn either provision new worker from here and then scehediule the job from here.
-
-    def find_idle_worker(self, resource_config: dict):
-        """
-        Find an idle worker that can run the job
-        """
-        for w in self.workers.values():
-            if w.status == WorkerStatus.IDLE:
-                # check the resource config
-                if w.resource_config == resource_config:
-                    continue
-                return w
-        return None
-
-    def assign_job_to_worker(self, job: Job, worker: Worker):
-        """
-        Mark the job as running and the worker as Running and 'execute' the job.
-        In a real system, we would send a remote command to the worker (eg: gRPC) to execute the job.
-        """
-        job.worker_id = worker.worker_id
-        job.status = JobStatus.RUNNING
-        worker.status = WorkerStatus.RUNNING
-        worker.current_job_id = job.job_id
-        logger.info(f"Assigned job {job.job_id} to worker {worker.worker_id}")
-        self._execute_job(job, worker)
-
-    def _execute_job(self, job: Job, worker: Worker):
-        """
-        Simulate the remote execution. right now, we jsut sleep for 1s.
-        In production, what we we can do is:
-        - Open a gRPC connection to the worker
-        - pass the job details
-        - wait for the compeltion call back
-        """
-        try:
-            logger.info(f"Executing job {job.job_id} on worker {worker.worker_id}")
-            time.sleep(
-                1
-            )  # Here we can add the actual execution logic, currently it mimics the execution.
-
-            # mark the job as completed
-            job.status = JobStatus.COMPLETED
-            job.result = "Job completed successfully"
-            logger.info(f"[Cluster Manager] Job {job.job_id} completed successfully")
-        except Exception as e:
-            job.status = JobStatus.FAILED
-            job.result = f"Job failed: {str(e)}"
-            logger.error(f"[Cluster Manager] Job {job.job_id} failed: {str(e)}")
-        finally:
-            worker.status = WorkerStatus.IDLE
-            worker.current_job_id = None
-
-    def get_job_status(self, job_id):
-        """
-        Get the job details
-        """
-        job = self.jobs.get(job_id)
-        if not job:
-            logger.error(f"Job {job_id} not found")
-            return None
-        return job
-
-    # this function has retry logic but it's currently fuzzy, we might have to change it.
-
-    def retry_queued_jobs(self):
-        """
-        Retry all queued jobs
-        """
-        for job in self.jobs.values():
-            if job.status == JobStatus.QUEUED:
-                self.schedule_job(job)
tetra_rp/core/pool/dataclass.py DELETED
@@ -1,18 +0,0 @@
-from enum import Enum
-
-
-class WorkerStatus(Enum):
-    """Enum representing the status of a worker"""
-
-    IDLE = "idle"
-    RUNNING = "running"
-    OFFLINE = "offline"
-
-
-class JobStatus(Enum):
-    """Enum representing the status of a job"""
-
-    QUEUED = "queued"
-    RUNNING = "running"
-    COMPLETED = "completed"
-    FAILED = "failed"
tetra_rp/core/pool/ex.py DELETED
@@ -1,38 +0,0 @@
-from cluster_manager import ClusterManager
-
-
-if __name__ == "__main__":
-    cm = ClusterManager()
-
-    # 1) Submit a job with no existing workers (use resource_config dict)
-    job_id = cm.submit_job(
-        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
-    )
-    print(
-        "Job status:", cm.get_job_status(job_id)
-    )  # should be QUEUED, no suitable worker
-
-    # 2) Add a worker that doesn't match the GPU
-    w1 = cm.add_worker(
-        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
-    )
-    # Re-try scheduling
-    cm.retry_queued_jobs()
-    print("Job status (still queued):", cm.get_job_status(job_id))
-
-    # 3) Add a matching worker
-    w2 = cm.add_worker(
-        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
-    )
-    # Re-try scheduling
-    cm.retry_queued_jobs()
-    print("Job status (should complete):", cm.get_job_status(job_id))
-
-    # 4) Submit another job that requires less resources
-    job_id2 = cm.submit_job(resource_config={"memory": 8, "network_volume": 10})
-    # Should be assigned to w1 if it's idle
-    print("Job2 final status:", cm.get_job_status(job_id2))
-
-    # 5) Show final state of workers
-    for worker in cm.list_workers():
-        print("Worker:", worker)
tetra_rp/core/pool/job.py DELETED
@@ -1,22 +0,0 @@
-import uuid
-from dataclass import JobStatus
-
-
-class Job:
-    """Represents a 'job' in the system
-
-    In a real system, this might contain the function to run,
-    arguments, and reference to data or code.
-    """
-
-    def __init__(self, resource_config: dict):
-        self.job_id = str(uuid.uuid4())[:8]
-        self.resource_config = resource_config
-        self.status = JobStatus.QUEUED
-
-        self.worker_id = None
-        self.result = None
-        self.error = None
-
-    def __repr__(self):
-        return f"Job(job_id={self.job_id}, status={self.status})"
tetra_rp/core/pool/worker.py DELETED
@@ -1,19 +0,0 @@
-import uuid
-from dataclass import WorkerStatus
-
-
-class Worker:
-    """Represents a single worker in the pool
-
-    For Now we store ressources in memory
-    """
-
-    def __init__(self, resource_config: dict):
-        self.worker_id = str(uuid.uuid4())[:8]
-        self.resource_config = resource_config
-        self.status = WorkerStatus.IDLE
-
-        self.current_job_id = None
-
-    def __repr__(self):
-        return f"Worker(worker_id={self.worker_id}, status={self.status})"