tetra-rp 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tetra_rp/__init__.py +37 -0
- tetra_rp/client.py +59 -0
- tetra_rp/core/__init__.py +0 -0
- tetra_rp/core/api/__init__.py +5 -0
- tetra_rp/core/api/runpod.py +212 -0
- tetra_rp/core/pool/__init__.py +0 -0
- tetra_rp/core/pool/cluster_manager.py +177 -0
- tetra_rp/core/pool/dataclass.py +18 -0
- tetra_rp/core/pool/ex.py +38 -0
- tetra_rp/core/pool/job.py +22 -0
- tetra_rp/core/pool/worker.py +19 -0
- tetra_rp/core/resources/__init__.py +33 -0
- tetra_rp/core/resources/base.py +47 -0
- tetra_rp/core/resources/cloud.py +4 -0
- tetra_rp/core/resources/cpu.py +34 -0
- tetra_rp/core/resources/environment.py +41 -0
- tetra_rp/core/resources/gpu.py +53 -0
- tetra_rp/core/resources/live_serverless.py +32 -0
- tetra_rp/core/resources/resource_manager.py +80 -0
- tetra_rp/core/resources/serverless.py +476 -0
- tetra_rp/core/resources/template.py +94 -0
- tetra_rp/core/resources/utils.py +50 -0
- tetra_rp/core/utils/__init__.py +0 -0
- tetra_rp/core/utils/backoff.py +43 -0
- tetra_rp/core/utils/json.py +33 -0
- tetra_rp/core/utils/singleton.py +7 -0
- tetra_rp/logger.py +34 -0
- tetra_rp/protos/__init__.py +0 -0
- tetra_rp/protos/remote_execution.py +57 -0
- tetra_rp/stubs/__init__.py +5 -0
- tetra_rp/stubs/live_serverless.py +133 -0
- tetra_rp/stubs/registry.py +85 -0
- tetra_rp/stubs/serverless.py +30 -0
- tetra_rp-0.5.5.dist-info/METADATA +806 -0
- tetra_rp-0.5.5.dist-info/RECORD +37 -0
- tetra_rp-0.5.5.dist-info/WHEEL +5 -0
- tetra_rp-0.5.5.dist-info/top_level.txt +1 -0
tetra_rp/__init__.py
ADDED
# Load .env vars from file
# before everything else
from dotenv import load_dotenv

load_dotenv()


from .logger import setup_logging  # noqa: E402

setup_logging()

from .client import remote  # noqa: E402
from .core.resources import (  # noqa: E402
    CpuServerlessEndpoint,
    CpuInstanceType,
    CudaVersion,
    GpuGroup,
    LiveServerless,
    PodTemplate,
    ResourceManager,
    ServerlessEndpoint,
    runpod,
)


__all__ = [
    "remote",
    "CpuServerlessEndpoint",
    "CpuInstanceType",
    "CudaVersion",
    "GpuGroup",
    "LiveServerless",
    "PodTemplate",
    "ResourceManager",
    "ServerlessEndpoint",
    "runpod",
]
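For orientation, here is a minimal, hedged usage sketch of the surface this `__init__.py` exposes. It assumes `RUNPOD_API_KEY` is available in the environment (or in `.env`, which the package loads on import); the endpoint name and the `LiveServerless` constructor fields are illustrative assumptions, not taken from this diff.

import asyncio

from tetra_rp import LiveServerless, remote

# Hypothetical resource config; "example-endpoint" and the constructor
# fields are illustrative assumptions.
gpu_config = LiveServerless(name="example-endpoint")


@remote(resource_config=gpu_config, dependencies=["numpy"])
async def add_arrays(a, b):
    import numpy as np

    return (np.array(a) + np.array(b)).tolist()


if __name__ == "__main__":
    print(asyncio.run(add_arrays([1, 2], [3, 4])))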
tetra_rp/client.py
ADDED
import logging
from functools import wraps
from typing import List, Optional
from .core.resources import ServerlessResource, ResourceManager
from .stubs import stub_resource


log = logging.getLogger(__name__)


def remote(
    resource_config: ServerlessResource,
    dependencies: Optional[List[str]] = None,
    system_dependencies: Optional[List[str]] = None,
    **extra,
):
    """
    Decorator to enable dynamic resource provisioning and dependency management for serverless functions.

    This decorator allows a function to be executed in a remote serverless environment, with support for
    dynamic resource provisioning and installation of required dependencies.

    Args:
        resource_config (ServerlessResource): Configuration object specifying the serverless resource
            to be provisioned or used.
        dependencies (List[str], optional): A list of pip package names to be installed in the remote
            environment before executing the function. Defaults to None.
        system_dependencies (List[str], optional): A list of system packages to be installed in the
            remote environment before executing the function. Defaults to None.
        extra (dict, optional): Additional parameters for the execution of the resource. Defaults to an empty dict.

    Returns:
        Callable: A decorator that wraps the target function, enabling remote execution with the
            specified resource configuration and dependencies.

    Example:
        ```python
        @remote(
            resource_config=my_resource_config,
            dependencies=["numpy", "pandas"],
            sync=True  # Optional, to run synchronously
        )
        async def my_function(data):
            # Function logic here
            pass
        ```
    """

    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            # Resolve an existing deployment or deploy a new one for this config.
            resource_manager = ResourceManager()
            remote_resource = await resource_manager.get_or_deploy_resource(
                resource_config
            )

            # Build a stub bound to the deployed resource and execute remotely.
            stub = stub_resource(remote_resource, **extra)
            return await stub(func, dependencies, system_dependencies, *args, **kwargs)

        return wrapper

    return decorator
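Two calling-side details follow from `wrapper` above: the decorated function is always a coroutine, so it must be awaited even if its body looks synchronous, and the endpoint is resolved or deployed lazily on the first call rather than at decoration time. A small sketch of the call side, reusing the hypothetical `my_function` from the docstring example:

import asyncio


async def main():
    # The first await triggers get_or_deploy_resource(); subsequent calls
    # reuse the endpoint tracked by ResourceManager.
    result = await my_function({"x": 1})
    print(result)


asyncio.run(main())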
tetra_rp/core/__init__.py
File without changes

tetra_rp/core/api/runpod.py
ADDED
"""
Direct GraphQL communication with the Runpod API.
Bypasses the limitations of the outdated runpod-python SDK.
"""

import os
import json
import aiohttp
from typing import Dict, Any, List, Optional
import logging

log = logging.getLogger(__name__)

RUNPOD_API_BASE_URL = os.environ.get("RUNPOD_API_BASE_URL", "https://api.runpod.io")


class RunpodGraphQLClient:
    """
    GraphQL client for the Runpod API.
    Communicates directly with Runpod's GraphQL endpoint without SDK limitations.
    """

    GRAPHQL_URL = f"{RUNPOD_API_BASE_URL}/graphql"

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
        if not self.api_key:
            raise ValueError("Runpod API key is required")

        self.session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create an aiohttp session."""
        if self.session is None or self.session.closed:
            timeout = aiohttp.ClientTimeout(total=300)  # 5 minute timeout
            self.session = aiohttp.ClientSession(
                timeout=timeout,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
            )
        return self.session

    async def _execute_graphql(
        self, query: str, variables: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Execute a GraphQL query/mutation."""
        session = await self._get_session()

        payload = {"query": query, "variables": variables or {}}

        log.debug(f"GraphQL Query: {query}")
        log.debug(f"GraphQL Variables: {json.dumps(variables, indent=2)}")

        try:
            async with session.post(self.GRAPHQL_URL, json=payload) as response:
                response_data = await response.json()

                log.debug(f"GraphQL Response Status: {response.status}")
                log.debug(f"GraphQL Response: {json.dumps(response_data, indent=2)}")

                if response.status >= 400:
                    raise Exception(
                        f"GraphQL request failed: {response.status} - {response_data}"
                    )

                if "errors" in response_data:
                    errors = response_data["errors"]
                    error_msg = "; ".join([e.get("message", str(e)) for e in errors])
                    raise Exception(f"GraphQL errors: {error_msg}")

                return response_data.get("data", {})

        except aiohttp.ClientError as e:
            log.error(f"HTTP client error: {e}")
            raise Exception(f"HTTP request failed: {e}")

    async def create_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Create a serverless endpoint using a direct GraphQL mutation.
        Supports both GPU and CPU endpoints with full field support.
        """
        # GraphQL mutation for saveEndpoint (based on the actual schema)
        mutation = """
        mutation saveEndpoint($input: EndpointInput!) {
            saveEndpoint(input: $input) {
                aiKey
                gpuIds
                id
                idleTimeout
                locations
                name
                networkVolumeId
                scalerType
                scalerValue
                templateId
                type
                userId
                version
                workersMax
                workersMin
                workersStandby
                workersPFBTarget
                gpuCount
                allowedCudaVersions
                executionTimeoutMs
                instanceIds
                activeBuildid
                idePodId
            }
        }
        """

        variables = {"input": input_data}

        log.debug(
            f"Creating endpoint with GraphQL: {input_data.get('name', 'unnamed')}"
        )

        result = await self._execute_graphql(mutation, variables)

        if "saveEndpoint" not in result:
            raise Exception("Unexpected GraphQL response structure")

        endpoint_data = result["saveEndpoint"]
        log.info(
            f"Created endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}"
        )

        return endpoint_data

    async def get_cpu_types(self) -> List[Dict[str, Any]]:
        """Get available CPU types."""
        query = """
        query getCpuTypes {
            cpuTypes {
                id
                displayName
                manufacturer
                cores
                threadsPerCore
                groupId
            }
        }
        """

        result = await self._execute_graphql(query)
        return result.get("cpuTypes", [])

    async def get_gpu_types(
        self, gpu_filter: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Get available GPU types."""
        query = """
        query getGpuTypes($input: GpuTypeFilter) {
            gpuTypes(input: $input) {
                id
                displayName
                manufacturer
                memoryInGb
                cudaCores
                secureCloud
                communityCloud
                securePrice
                communityPrice
                communitySpotPrice
                secureSpotPrice
                maxGpuCount
                maxGpuCountCommunityCloud
                maxGpuCountSecureCloud
                minPodGpuCount
                nodeGroupGpuSizes
                throughput
            }
        }
        """

        variables = {"input": gpu_filter} if gpu_filter else {}
        result = await self._execute_graphql(query, variables)
        return result.get("gpuTypes", [])

    async def get_endpoint(self, endpoint_id: str) -> Dict[str, Any]:
        """Get endpoint details."""
        # Note: the schema doesn't show a single-endpoint query.
        # This would need to be implemented if such a query exists.
        raise NotImplementedError("Get endpoint query not available in current schema")

    async def delete_endpoint(self, endpoint_id: str) -> Dict[str, Any]:
        """Delete a serverless endpoint."""
        mutation = """
        mutation deleteEndpoint($id: String!) {
            deleteEndpoint(id: $id)
        }
        """

        variables = {"id": endpoint_id}
        log.info(f"Deleting endpoint: {endpoint_id}")

        result = await self._execute_graphql(mutation, variables)
        return {"success": result.get("deleteEndpoint") is not None}

    async def close(self):
        """Close the HTTP session."""
        if self.session and not self.session.closed:
            await self.session.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
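Because the client owns its `aiohttp` session and implements `__aenter__`/`__aexit__`, the natural way to use it is as an async context manager. A hedged sketch, assuming `RUNPOD_API_KEY` is exported; the `EndpointInput` payload below mirrors fields selected in the `saveEndpoint` mutation, but the concrete values (template id, GPU pool id, worker counts) are illustrative assumptions.

import asyncio

from tetra_rp.core.api.runpod import RunpodGraphQLClient


async def main():
    async with RunpodGraphQLClient() as client:
        # Inspect GPU offerings; field names come from the gpuTypes query above.
        gpus = await client.get_gpu_types()
        print([gpu["id"] for gpu in gpus])

        # Illustrative EndpointInput payload; the exact required fields are an
        # assumption here and would be validated by the GraphQL schema.
        endpoint = await client.create_endpoint(
            {
                "name": "demo-endpoint",
                "templateId": "my-template-id",  # hypothetical template id
                "gpuIds": "AMPERE_16",
                "workersMin": 0,
                "workersMax": 1,
            }
        )

        # Clean up the endpoint we just created.
        await client.delete_endpoint(endpoint["id"])


asyncio.run(main())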
tetra_rp/core/pool/__init__.py
File without changes

tetra_rp/core/pool/cluster_manager.py
ADDED
import time
from worker import Worker
from job import Job

from dataclass import WorkerStatus, JobStatus

import logging
import inspect


def setup_logging(level=logging.INFO, fmt=None):
    if fmt is None:
        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=level, format=fmt)


def get_logger(name=None):
    """
    Returns a logger. If no name is provided, it infers the caller's module name.
    """
    if name is None:
        # Get the caller's module name.
        frame = inspect.stack()[1]
        module = inspect.getmodule(frame[0])
        name = module.__name__ if module else "__main__"
    return logging.getLogger(name)


logger = get_logger(__name__)


class ClusterManager:
    """
    Manages workers and jobs, currently in memory. A real deployment would add:
    - Runpod for provisioning
    - Real remote execution
    - A database for persistent state
    """

    def __init__(self):
        self.workers = {}  # Worker ID -> Worker
        self.jobs = {}  # Job ID -> Job

    # ----------------- Worker Management -----------------
    # ------------------------------------------------------
    def add_worker(self, resource_config: dict):
        """
        Add a new worker to the cluster.
        """
        # The logic to create a real worker goes here: RUNPOD provisioning will be added.
        worker = Worker(resource_config)
        self.workers[worker.worker_id] = worker

        logger.info(f"Added worker {worker.worker_id} to the cluster")
        return worker.worker_id

    def remove_worker(self, worker_id):
        """
        Remove a worker from the cluster.
        """
        worker = self.workers.get(worker_id)
        if not worker:
            logger.error(f"Worker {worker_id} not found")
            return False
        if worker.status == WorkerStatus.RUNNING:
            logger.error(f"Worker {worker_id} is still running")
            return False
        del self.workers[worker_id]
        logger.info(f"Removed worker {worker_id} from the cluster")
        return True

    def list_workers(self):
        """
        List all workers in the cluster.
        """
        return list(self.workers.values())

    # ----------------- Job Management -----------------
    # ---------------------------------------------------

    def submit_job(self, resource_config: dict):
        """
        Submit a new job to the cluster (queued), then attempt to schedule it.
        """
        job = Job(resource_config)
        self.jobs[job.job_id] = job
        logger.info(f"Submitted job {job.job_id} to the cluster")
        # Attempt to schedule the job.
        self.schedule_job(job)
        return job.job_id

    def schedule_job(self, job: Job):
        """
        Find a suitable worker for the job. If none, the job remains queued.
        If we want auto-provisioning, we can add logic here to add a worker when none is available.
        """
        if job.status != JobStatus.QUEUED:
            logger.error(f"Job {job.job_id} is not queued")
            return False

        # Find a worker candidate.
        candidate = self.find_idle_worker(job.resource_config)
        if candidate:
            self.assign_job_to_worker(job, candidate)
        else:
            logger.info(f"No worker available for job {job.job_id}")
            # We could provision a new worker from here and then schedule the job.

    def find_idle_worker(self, resource_config: dict):
        """
        Find an idle worker that can run the job.
        """
        for w in self.workers.values():
            if w.status == WorkerStatus.IDLE:
                # Check the resource config; skip workers whose config doesn't match.
                if w.resource_config != resource_config:
                    continue
                return w
        return None

    def assign_job_to_worker(self, job: Job, worker: Worker):
        """
        Mark the job and the worker as running, then 'execute' the job.
        In a real system, we would send a remote command to the worker (e.g. gRPC) to execute the job.
        """
        job.worker_id = worker.worker_id
        job.status = JobStatus.RUNNING
        worker.status = WorkerStatus.RUNNING
        worker.current_job_id = job.job_id
        logger.info(f"Assigned job {job.job_id} to worker {worker.worker_id}")
        self._execute_job(job, worker)

    def _execute_job(self, job: Job, worker: Worker):
        """
        Simulate the remote execution; right now, we just sleep for 1s.
        In production, we could instead:
        - Open a gRPC connection to the worker
        - Pass the job details
        - Wait for the completion callback
        """
        try:
            logger.info(f"Executing job {job.job_id} on worker {worker.worker_id}")
            # Placeholder for the actual execution logic; this mimics the execution.
            time.sleep(1)

            # Mark the job as completed.
            job.status = JobStatus.COMPLETED
            job.result = "Job completed successfully"
            logger.info(f"[Cluster Manager] Job {job.job_id} completed successfully")
        except Exception as e:
            job.status = JobStatus.FAILED
            job.result = f"Job failed: {str(e)}"
            logger.error(f"[Cluster Manager] Job {job.job_id} failed: {str(e)}")
        finally:
            worker.status = WorkerStatus.IDLE
            worker.current_job_id = None

    def get_job_status(self, job_id):
        """
        Get the job details.
        """
        job = self.jobs.get(job_id)
        if not job:
            logger.error(f"Job {job_id} not found")
            return None
        return job

    # This function has retry logic, but it's currently fuzzy; we might have to change it.

    def retry_queued_jobs(self):
        """
        Retry all queued jobs.
        """
        for job in self.jobs.values():
            if job.status == JobStatus.QUEUED:
                self.schedule_job(job)
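The `schedule_job` docstring leaves auto-provisioning as an open hook. A minimal sketch of what that could look like, written as if it lived alongside `ClusterManager` in the same module; `AutoProvisioningClusterManager` is a hypothetical subclass for illustration, not part of the package.

class AutoProvisioningClusterManager(ClusterManager):
    """Hypothetical variant: provision a matching worker when none is idle."""

    def schedule_job(self, job):
        if job.status != JobStatus.QUEUED:
            logger.error(f"Job {job.job_id} is not queued")
            return False
        candidate = self.find_idle_worker(job.resource_config)
        if candidate is None:
            # No idle worker with a matching config: provision one on demand.
            # In production, the Runpod provisioning call would go here.
            worker_id = self.add_worker(job.resource_config)
            candidate = self.workers[worker_id]
        self.assign_job_to_worker(job, candidate)
        return True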
tetra_rp/core/pool/dataclass.py
ADDED

from enum import Enum


class WorkerStatus(Enum):
    """Enum representing the status of a worker"""

    IDLE = "idle"
    RUNNING = "running"
    OFFLINE = "offline"


class JobStatus(Enum):
    """Enum representing the status of a job"""

    QUEUED = "queued"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
tetra_rp/core/pool/ex.py
ADDED
from cluster_manager import ClusterManager


if __name__ == "__main__":
    cm = ClusterManager()

    # 1) Submit a job with no existing workers (use a resource_config dict)
    job_id = cm.submit_job(
        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
    )
    print(
        "Job status:", cm.get_job_status(job_id)
    )  # should be QUEUED, no suitable worker

    # 2) Add a worker that doesn't match the GPU
    w1 = cm.add_worker(
        resource_config={"gpu": "A100", "memory": 16, "network_volume": 50}
    )
    # Re-try scheduling
    cm.retry_queued_jobs()
    print("Job status (still queued):", cm.get_job_status(job_id))

    # 3) Add a matching worker
    w2 = cm.add_worker(
        resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
    )
    # Re-try scheduling
    cm.retry_queued_jobs()
    print("Job status (should complete):", cm.get_job_status(job_id))

    # 4) Submit another job that requires fewer resources.
    # Matching is exact, so this config matches neither worker and stays queued.
    job_id2 = cm.submit_job(resource_config={"memory": 8, "network_volume": 10})
    print("Job2 final status:", cm.get_job_status(job_id2))

    # 5) Show the final state of the workers
    for worker in cm.list_workers():
        print("Worker:", worker)
tetra_rp/core/pool/job.py
ADDED

import uuid
from dataclass import JobStatus


class Job:
    """Represents a 'job' in the system.

    In a real system, this might contain the function to run,
    arguments, and a reference to data or code.
    """

    def __init__(self, resource_config: dict):
        self.job_id = str(uuid.uuid4())[:8]
        self.resource_config = resource_config
        self.status = JobStatus.QUEUED

        self.worker_id = None
        self.result = None
        self.error = None

    def __repr__(self):
        return f"Job(job_id={self.job_id}, status={self.status})"
tetra_rp/core/pool/worker.py
ADDED

import uuid
from dataclass import WorkerStatus


class Worker:
    """Represents a single worker in the pool.

    For now, we store resources in memory.
    """

    def __init__(self, resource_config: dict):
        self.worker_id = str(uuid.uuid4())[:8]
        self.resource_config = resource_config
        self.status = WorkerStatus.IDLE

        self.current_job_id = None

    def __repr__(self):
        return f"Worker(worker_id={self.worker_id}, status={self.status})"
tetra_rp/core/resources/__init__.py
ADDED

from .base import BaseResource, DeployableResource
from .cloud import runpod
from .cpu import CpuInstanceType
from .gpu import GpuGroup, GpuType, GpuTypeDetail
from .resource_manager import ResourceManager
from .live_serverless import LiveServerless
from .serverless import (
    CpuServerlessEndpoint,
    ServerlessResource,
    ServerlessEndpoint,
    JobOutput,
    CudaVersion,
)
from .template import PodTemplate


__all__ = [
    "runpod",
    "BaseResource",
    "CpuInstanceType",
    "CpuServerlessEndpoint",
    "CudaVersion",
    "DeployableResource",
    "GpuGroup",
    "GpuType",
    "GpuTypeDetail",
    "JobOutput",
    "LiveServerless",
    "ResourceManager",
    "ServerlessResource",
    "ServerlessEndpoint",
    "PodTemplate",
]
tetra_rp/core/resources/base.py
ADDED

import hashlib
from abc import ABC, abstractmethod
from typing import Optional
from pydantic import BaseModel, ConfigDict


class BaseResource(BaseModel):
    """Base class for all resources."""

    model_config = ConfigDict(
        validate_by_name=True,
        validate_default=True,
        serialize_by_alias=True,
    )

    id: Optional[str] = None

    @property
    def resource_id(self) -> str:
        """Unique resource ID based on configuration."""
        resource_type = self.__class__.__name__
        config_str = self.model_dump_json(exclude_none=True)
        hash_obj = hashlib.md5(f"{resource_type}:{config_str}".encode())
        return f"{resource_type}_{hash_obj.hexdigest()}"


class DeployableResource(BaseResource, ABC):
    """Base class for deployable resources."""

    def __str__(self) -> str:
        return f"{self.__class__.__name__}"

    @property
    @abstractmethod
    def url(self) -> str:
        """Public URL of the resource."""
        raise NotImplementedError("Subclasses should implement this method.")

    @abstractmethod
    def is_deployed(self) -> bool:
        """Check if the resource is still valid or available."""
        raise NotImplementedError("Subclasses should implement this method.")

    @abstractmethod
    async def deploy(self) -> "DeployableResource":
        """Deploy the resource."""
        raise NotImplementedError("Subclasses should implement this method.")