tetra_rp-0.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tetra_rp/__init__.py ADDED
@@ -0,0 +1,37 @@
+ # Load .env vars from file
+ # before everything else
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+
+ from .logger import setup_logging  # noqa: E402
+
+ setup_logging()
+
+ from .client import remote  # noqa: E402
+ from .core.resources import (  # noqa: E402
+     CpuServerlessEndpoint,
+     CpuInstanceType,
+     CudaVersion,
+     GpuGroup,
+     LiveServerless,
+     PodTemplate,
+     ResourceManager,
+     ServerlessEndpoint,
+     runpod,
+ )
+
+
+ __all__ = [
+     "remote",
+     "CpuServerlessEndpoint",
+     "CpuInstanceType",
+     "CudaVersion",
+     "GpuGroup",
+     "LiveServerless",
+     "PodTemplate",
+     "ResourceManager",
+     "ServerlessEndpoint",
+     "runpod",
+ ]
tetra_rp/client.py ADDED
@@ -0,0 +1,59 @@
+ import logging
+ from functools import wraps
+ from typing import List, Optional
+ from .core.resources import ServerlessResource, ResourceManager
+ from .stubs import stub_resource
+
+
+ log = logging.getLogger(__name__)
+
+
+ def remote(
+     resource_config: ServerlessResource,
+     dependencies: Optional[List[str]] = None,
+     system_dependencies: Optional[List[str]] = None,
+     **extra,
+ ):
+     """
+     Decorator to enable dynamic resource provisioning and dependency management for serverless functions.
+
+     This decorator allows a function to be executed in a remote serverless environment, with support for
+     dynamic resource provisioning and installation of required dependencies.
+
+     Args:
+         resource_config (ServerlessResource): Configuration object specifying the serverless resource
+             to be provisioned or used.
+         dependencies (List[str], optional): A list of pip package names to be installed in the remote
+             environment before executing the function. Defaults to None.
+         system_dependencies (List[str], optional): A list of system packages to be installed in the
+             remote environment before executing the function. Defaults to None.
+         **extra: Additional keyword arguments for the execution of the resource.
+
+     Returns:
+         Callable: A decorator that wraps the target function, enabling remote execution with the
+             specified resource configuration and dependencies.
+
+     Example:
+         ```python
+         @remote(
+             resource_config=my_resource_config,
+             dependencies=["numpy", "pandas"],
+             sync=True  # Optional, to run synchronously
+         )
+         async def my_function(data):
+             # Function logic here
+             pass
+         ```
+     """
+
+     def decorator(func):
+         @wraps(func)
+         async def wrapper(*args, **kwargs):
+             resource_manager = ResourceManager()
+             remote_resource = await resource_manager.get_or_deploy_resource(
+                 resource_config
+             )
+
+             stub = stub_resource(remote_resource, **extra)
+             return await stub(func, dependencies, system_dependencies, *args, **kwargs)
+
+         return wrapper
+
+     return decorator
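
For orientation, here is a minimal end-to-end sketch of calling a decorated function. The config fields and endpoint name are illustrative assumptions, not part of the package, and `RUNPOD_API_KEY` is assumed to be set in the environment:

```python
import asyncio

from tetra_rp import remote, LiveServerless

# Hypothetical configuration; the actual LiveServerless fields may differ.
gpu_config = LiveServerless(name="example-endpoint")


@remote(resource_config=gpu_config, dependencies=["numpy"])
async def add_arrays(a, b):
    import numpy as np  # installed remotely via `dependencies`

    return (np.array(a) + np.array(b)).tolist()


# The wrapper returned by @remote is async, so the call must be awaited.
print(asyncio.run(add_arrays([1, 2], [3, 4])))
```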
File without changes
@@ -0,0 +1,5 @@
+ from .runpod import RunpodGraphQLClient
+
+ __all__ = [
+     "RunpodGraphQLClient",
+ ]
@@ -0,0 +1,212 @@
+ """
+ Direct GraphQL communication with the Runpod API.
+ Bypasses the limitations of the outdated runpod-python SDK.
+ """
+
+ import os
+ import json
+ import aiohttp
+ from typing import Any, Dict, List, Optional
+ import logging
+
+ log = logging.getLogger(__name__)
+
+ RUNPOD_API_BASE_URL = os.environ.get("RUNPOD_API_BASE_URL", "https://api.runpod.io")
+
+
+ class RunpodGraphQLClient:
+     """
+     Async GraphQL client for the Runpod API.
+     Communicates directly with Runpod's GraphQL endpoint, without SDK limitations.
+     """
+
+     GRAPHQL_URL = f"{RUNPOD_API_BASE_URL}/graphql"
+
+     def __init__(self, api_key: Optional[str] = None):
+         self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
+         if not self.api_key:
+             raise ValueError("Runpod API key is required")
+
+         self.session: Optional[aiohttp.ClientSession] = None
+
+     async def _get_session(self) -> aiohttp.ClientSession:
+         """Get or create an aiohttp session."""
+         if self.session is None or self.session.closed:
+             timeout = aiohttp.ClientTimeout(total=300)  # 5 minute timeout
+             self.session = aiohttp.ClientSession(
+                 timeout=timeout,
+                 headers={
+                     "Authorization": f"Bearer {self.api_key}",
+                     "Content-Type": "application/json",
+                 },
+             )
+         return self.session
+
+     async def _execute_graphql(
+         self, query: str, variables: Optional[Dict[str, Any]] = None
+     ) -> Dict[str, Any]:
+         """Execute a GraphQL query/mutation."""
+         session = await self._get_session()
+
+         payload = {"query": query, "variables": variables or {}}
+
+         log.debug(f"GraphQL Query: {query}")
+         log.debug(f"GraphQL Variables: {json.dumps(variables, indent=2)}")
+
+         try:
+             async with session.post(self.GRAPHQL_URL, json=payload) as response:
+                 response_data = await response.json()
+
+                 log.debug(f"GraphQL Response Status: {response.status}")
+                 log.debug(f"GraphQL Response: {json.dumps(response_data, indent=2)}")
+
+                 if response.status >= 400:
+                     raise Exception(
+                         f"GraphQL request failed: {response.status} - {response_data}"
+                     )
+
+                 if "errors" in response_data:
+                     errors = response_data["errors"]
+                     error_msg = "; ".join([e.get("message", str(e)) for e in errors])
+                     raise Exception(f"GraphQL errors: {error_msg}")
+
+                 return response_data.get("data", {})
+
+         except aiohttp.ClientError as e:
+             log.error(f"HTTP client error: {e}")
+             raise Exception(f"HTTP request failed: {e}")
+
+     async def create_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         Create a serverless endpoint using a direct GraphQL mutation.
+         Supports both GPU and CPU endpoints with full field support.
+         """
+         # GraphQL mutation for saveEndpoint (based on the actual schema)
+         mutation = """
+         mutation saveEndpoint($input: EndpointInput!) {
+             saveEndpoint(input: $input) {
+                 aiKey
+                 gpuIds
+                 id
+                 idleTimeout
+                 locations
+                 name
+                 networkVolumeId
+                 scalerType
+                 scalerValue
+                 templateId
+                 type
+                 userId
+                 version
+                 workersMax
+                 workersMin
+                 workersStandby
+                 workersPFBTarget
+                 gpuCount
+                 allowedCudaVersions
+                 executionTimeoutMs
+                 instanceIds
+                 activeBuildid
+                 idePodId
+             }
+         }
+         """
+
+         variables = {"input": input_data}
+
+         log.debug(
+             f"Creating endpoint with GraphQL: {input_data.get('name', 'unnamed')}"
+         )
+
+         result = await self._execute_graphql(mutation, variables)
+
+         if "saveEndpoint" not in result:
+             raise Exception("Unexpected GraphQL response structure")
+
+         endpoint_data = result["saveEndpoint"]
+         log.info(
+             f"Created endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}"
+         )
+
+         return endpoint_data
+
+     async def get_cpu_types(self) -> List[Dict[str, Any]]:
+         """Get available CPU types."""
+         query = """
+         query getCpuTypes {
+             cpuTypes {
+                 id
+                 displayName
+                 manufacturer
+                 cores
+                 threadsPerCore
+                 groupId
+             }
+         }
+         """
+
+         result = await self._execute_graphql(query)
+         return result.get("cpuTypes", [])
+
+     async def get_gpu_types(
+         self, gpu_filter: Optional[Dict[str, Any]] = None
+     ) -> List[Dict[str, Any]]:
+         """Get available GPU types."""
+         query = """
+         query getGpuTypes($input: GpuTypeFilter) {
+             gpuTypes(input: $input) {
+                 id
+                 displayName
+                 manufacturer
+                 memoryInGb
+                 cudaCores
+                 secureCloud
+                 communityCloud
+                 securePrice
+                 communityPrice
+                 communitySpotPrice
+                 secureSpotPrice
+                 maxGpuCount
+                 maxGpuCountCommunityCloud
+                 maxGpuCountSecureCloud
+                 minPodGpuCount
+                 nodeGroupGpuSizes
+                 throughput
+             }
+         }
+         """
+
+         variables = {"input": gpu_filter} if gpu_filter else {}
+         result = await self._execute_graphql(query, variables)
+         return result.get("gpuTypes", [])
+
+     async def get_endpoint(self, endpoint_id: str) -> Dict[str, Any]:
+         """Get endpoint details."""
+         # Note: the schema doesn't expose a single-endpoint query;
+         # implement this if/when such a query becomes available.
+         raise NotImplementedError("Get endpoint query not available in current schema")
+
+     async def delete_endpoint(self, endpoint_id: str) -> Dict[str, Any]:
+         """Delete a serverless endpoint."""
+         mutation = """
+         mutation deleteEndpoint($id: String!) {
+             deleteEndpoint(id: $id)
+         }
+         """
+
+         variables = {"id": endpoint_id}
+         log.info(f"Deleting endpoint: {endpoint_id}")
+
+         result = await self._execute_graphql(mutation, variables)
+         return {"success": result.get("deleteEndpoint") is not None}
+
+     async def close(self):
+         """Close the HTTP session."""
+         if self.session and not self.session.closed:
+             await self.session.close()
+
+     async def __aenter__(self):
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self.close()
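
Since the client defines `__aenter__`/`__aexit__`, it can be used as an async context manager, which guarantees the aiohttp session is closed. A brief usage sketch (the import path is an assumption inferred from the package layout, and `RUNPOD_API_KEY` is assumed to be set):

```python
import asyncio

from tetra_rp.core.api import RunpodGraphQLClient


async def main():
    # The context manager closes the underlying aiohttp session on exit.
    async with RunpodGraphQLClient() as client:
        gpus = await client.get_gpu_types()
        print([gpu["id"] for gpu in gpus])


asyncio.run(main())
```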
File without changes
@@ -0,0 +1,177 @@
+ import time
+ import logging
+ import inspect
+
+ from worker import Worker
+ from job import Job
+ from dataclass import WorkerStatus, JobStatus
+
+
+ def setup_logging(level=logging.INFO, fmt=None):
+     if fmt is None:
+         fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+     logging.basicConfig(level=level, format=fmt)
+
+
+ def get_logger(name=None):
+     """
+     Returns a logger. If no name is provided, it infers the caller's module name.
+     """
+     if name is None:
+         # Get the caller's module name.
+         frame = inspect.stack()[1]
+         module = inspect.getmodule(frame[0])
+         name = module.__name__ if module else "__main__"
+     return logging.getLogger(name)
+
+
+ logger = get_logger(__name__)
+
+
+ class ClusterManager:
+     """
+     Manages workers and jobs, currently in memory. Planned extensions:
+     - Runpod for provisioning
+     - Real remote execution
+     - A database for persistence
+     """
+
+     def __init__(self):
+         self.workers = {}  # Worker ID -> Worker
+         self.jobs = {}  # Job ID -> Job
+
+     # ----------------- Worker Management -----------------
+     # ------------------------------------------------------
+     def add_worker(self, resource_config: dict):
+         """
+         Add a new worker to the cluster.
+         """
+         # The logic to create a worker and add it to the cluster goes here;
+         # Runpod provisioning will be added here.
+         worker = Worker(resource_config)
+         self.workers[worker.worker_id] = worker
+
+         logger.info(f"Added worker {worker.worker_id} to the cluster")
+         return worker.worker_id
+
+     def remove_worker(self, worker_id):
+         """
+         Remove a worker from the cluster.
+         """
+         worker = self.workers.get(worker_id)
+         if not worker:
+             logger.error(f"Worker {worker_id} not found")
+             return False
+         if worker.status == WorkerStatus.RUNNING:
+             logger.error(f"Worker {worker_id} is still running")
+             return False
+         del self.workers[worker_id]
+         logger.info(f"Removed worker {worker_id} from the cluster")
+         return True
+
+     def list_workers(self):
+         """
+         List all workers in the cluster.
+         """
+         return list(self.workers.values())
+
+     # ----------------- Job Management -----------------
+     # ---------------------------------------------------
+
+     def submit_job(self, resource_config: dict):
+         """
+         Submit a new job to the cluster (queued), then attempt to schedule it.
+         """
+         job = Job(resource_config)
+         self.jobs[job.job_id] = job
+         logger.info(f"Submitted job {job.job_id} to the cluster")
+         # attempt to schedule the job
+         self.schedule_job(job)
+         return job.job_id
+
+     def schedule_job(self, job: Job):
+         """
+         Find a suitable worker for the job. If none is available, the job remains queued.
+         To support auto-provisioning, logic could be added here to create a worker
+         when none is available (see the sketch after this file).
+         """
+         if job.status != JobStatus.QUEUED:
+             logger.error(f"Job {job.job_id} is not queued")
+             return False
+
+         # Find a worker candidate
+         candidate = self.find_idle_worker(job.resource_config)
+         if candidate:
+             self.assign_job_to_worker(job, candidate)
+         else:
+             logger.info(f"No worker available for job {job.job_id}")
+             # We could provision a new worker here and then schedule the job.
+
+     def find_idle_worker(self, resource_config: dict):
+         """
+         Find an idle worker that can run the job.
+         """
+         for w in self.workers.values():
+             if w.status == WorkerStatus.IDLE:
+                 # Skip workers whose resource config doesn't match the job's.
+                 if w.resource_config != resource_config:
+                     continue
+                 return w
+         return None
+
+     def assign_job_to_worker(self, job: Job, worker: Worker):
+         """
+         Mark the job and the worker as running, then 'execute' the job.
+         In a real system, we would send a remote command to the worker (e.g. gRPC)
+         to execute the job.
+         """
+         job.worker_id = worker.worker_id
+         job.status = JobStatus.RUNNING
+         worker.status = WorkerStatus.RUNNING
+         worker.current_job_id = job.job_id
+         logger.info(f"Assigned job {job.job_id} to worker {worker.worker_id}")
+         self._execute_job(job, worker)
+
+     def _execute_job(self, job: Job, worker: Worker):
+         """
+         Simulate remote execution; right now, we just sleep for 1s.
+         In production we could instead:
+         - open a gRPC connection to the worker
+         - pass the job details
+         - wait for the completion callback
+         """
+         try:
+             logger.info(f"Executing job {job.job_id} on worker {worker.worker_id}")
+             # Mimics the execution; the actual execution logic goes here.
+             time.sleep(1)
+
+             # mark the job as completed
+             job.status = JobStatus.COMPLETED
+             job.result = "Job completed successfully"
+             logger.info(f"[Cluster Manager] Job {job.job_id} completed successfully")
+         except Exception as e:
+             job.status = JobStatus.FAILED
+             job.result = f"Job failed: {str(e)}"
+             logger.error(f"[Cluster Manager] Job {job.job_id} failed: {str(e)}")
+         finally:
+             worker.status = WorkerStatus.IDLE
+             worker.current_job_id = None
+
+     def get_job_status(self, job_id):
+         """
+         Get the job details.
+         """
+         job = self.jobs.get(job_id)
+         if not job:
+             logger.error(f"Job {job_id} not found")
+             return None
+         return job
+
+     # This retry logic is currently naive; it may need to change.
+
+     def retry_queued_jobs(self):
+         """
+         Retry all queued jobs.
+         """
+         for job in self.jobs.values():
+             if job.status == JobStatus.QUEUED:
+                 self.schedule_job(job)
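
The `schedule_job` docstring above leaves auto-provisioning as an open idea. A minimal sketch of what that extension could look like, assuming `add_worker` can satisfy any `resource_config` on demand (a hypothetical variant method, not part of the package):

```python
# Hypothetical auto-provisioning variant of ClusterManager.schedule_job;
# relies on the module's logger and JobStatus as defined above.
def schedule_job_with_autoprovision(self, job):
    if job.status != JobStatus.QUEUED:
        logger.error(f"Job {job.job_id} is not queued")
        return False

    candidate = self.find_idle_worker(job.resource_config)
    if candidate is None:
        # No idle match: provision a worker with the exact config, then assign.
        worker_id = self.add_worker(job.resource_config)
        candidate = self.workers[worker_id]

    self.assign_job_to_worker(job, candidate)
    return True
```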
@@ -0,0 +1,18 @@
+ from enum import Enum
+
+
+ class WorkerStatus(Enum):
+     """Enum representing the status of a worker"""
+
+     IDLE = "idle"
+     RUNNING = "running"
+     OFFLINE = "offline"
+
+
+ class JobStatus(Enum):
+     """Enum representing the status of a job"""
+
+     QUEUED = "queued"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
@@ -0,0 +1,38 @@
+ from cluster_manager import ClusterManager
+
+
+ if __name__ == "__main__":
+     cm = ClusterManager()
+
+     # 1) Submit a job with no existing workers (using a resource_config dict)
+     job_id = cm.submit_job(
+         resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
+     )
+     print(
+         "Job status:", cm.get_job_status(job_id)
+     )  # should be QUEUED, no suitable worker
+
+     # 2) Add a worker that doesn't match the GPU
+     w1 = cm.add_worker(
+         resource_config={"gpu": "A100", "memory": 16, "network_volume": 50}
+     )
+     # Re-try scheduling
+     cm.retry_queued_jobs()
+     print("Job status (still queued):", cm.get_job_status(job_id))
+
+     # 3) Add a matching worker
+     w2 = cm.add_worker(
+         resource_config={"gpu": "H100", "memory": 16, "network_volume": 50}
+     )
+     # Re-try scheduling
+     cm.retry_queued_jobs()
+     print("Job status (should complete):", cm.get_job_status(job_id))
+
+     # 4) Submit another job that requires fewer resources.
+     # With exact-match scheduling it stays queued; a capacity-aware matcher
+     # could assign it to an idle worker such as w1.
+     job_id2 = cm.submit_job(resource_config={"memory": 8, "network_volume": 10})
+     print("Job2 final status:", cm.get_job_status(job_id2))
+
+     # 5) Show final state of workers
+     for worker in cm.list_workers():
+         print("Worker:", worker)
@@ -0,0 +1,22 @@
+ import uuid
+ from dataclass import JobStatus
+
+
+ class Job:
+     """Represents a 'job' in the system.
+
+     In a real system, this might contain the function to run,
+     its arguments, and references to data or code.
+     """
+
+     def __init__(self, resource_config: dict):
+         self.job_id = str(uuid.uuid4())[:8]
+         self.resource_config = resource_config
+         self.status = JobStatus.QUEUED
+
+         self.worker_id = None
+         self.result = None
+         self.error = None
+
+     def __repr__(self):
+         return f"Job(job_id={self.job_id}, status={self.status})"
@@ -0,0 +1,19 @@
+ import uuid
+ from dataclass import WorkerStatus
+
+
+ class Worker:
+     """Represents a single worker in the pool.
+
+     For now we store resources in memory.
+     """
+
+     def __init__(self, resource_config: dict):
+         self.worker_id = str(uuid.uuid4())[:8]
+         self.resource_config = resource_config
+         self.status = WorkerStatus.IDLE
+
+         self.current_job_id = None
+
+     def __repr__(self):
+         return f"Worker(worker_id={self.worker_id}, status={self.status})"
@@ -0,0 +1,33 @@
+ from .base import BaseResource, DeployableResource
+ from .cloud import runpod
+ from .cpu import CpuInstanceType
+ from .gpu import GpuGroup, GpuType, GpuTypeDetail
+ from .resource_manager import ResourceManager
+ from .live_serverless import LiveServerless
+ from .serverless import (
+     CpuServerlessEndpoint,
+     ServerlessResource,
+     ServerlessEndpoint,
+     JobOutput,
+     CudaVersion,
+ )
+ from .template import PodTemplate
+
+
+ __all__ = [
+     "runpod",
+     "BaseResource",
+     "CpuInstanceType",
+     "CpuServerlessEndpoint",
+     "CudaVersion",
+     "DeployableResource",
+     "GpuGroup",
+     "GpuType",
+     "GpuTypeDetail",
+     "JobOutput",
+     "LiveServerless",
+     "ResourceManager",
+     "ServerlessResource",
+     "ServerlessEndpoint",
+     "PodTemplate",
+ ]
@@ -0,0 +1,47 @@
+ import hashlib
+ from abc import ABC, abstractmethod
+ from typing import Optional
+ from pydantic import BaseModel, ConfigDict
+
+
+ class BaseResource(BaseModel):
+     """Base class for all resources."""
+
+     model_config = ConfigDict(
+         validate_by_name=True,
+         validate_default=True,
+         serialize_by_alias=True,
+     )
+
+     id: Optional[str] = None
+
+     @property
+     def resource_id(self) -> str:
+         """Unique resource ID derived from the configuration."""
+         resource_type = self.__class__.__name__
+         config_str = self.model_dump_json(exclude_none=True)
+         hash_obj = hashlib.md5(f"{resource_type}:{config_str}".encode())
+         return f"{resource_type}_{hash_obj.hexdigest()}"
+
+
+ class DeployableResource(BaseResource, ABC):
+     """Base class for deployable resources."""
+
+     def __str__(self) -> str:
+         return f"{self.__class__.__name__}"
+
+     @property
+     @abstractmethod
+     def url(self) -> str:
+         """Public URL of the resource."""
+         raise NotImplementedError("Subclasses should implement this method.")
+
+     @abstractmethod
+     def is_deployed(self) -> bool:
+         """Check whether the resource is still valid and available."""
+         raise NotImplementedError("Subclasses should implement this method.")
+
+     @abstractmethod
+     async def deploy(self) -> "DeployableResource":
+         """Deploy the resource."""
+         raise NotImplementedError("Subclasses should implement this method.")
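
To make the `DeployableResource` contract concrete, here is a minimal sketch of a subclass. The class name, URL format, and deployment logic are illustrative assumptions only, not the package's real endpoint classes:

```python
# Hypothetical subclass illustrating the DeployableResource contract.
class ExampleEndpoint(DeployableResource):
    name: str = "example"

    @property
    def url(self) -> str:
        # Illustrative URL format; real subclasses derive this from the API.
        return f"https://api.runpod.ai/v2/{self.id}"

    def is_deployed(self) -> bool:
        # Treat the resource as deployed once the backend assigned an id.
        return self.id is not None

    async def deploy(self) -> "DeployableResource":
        # A real implementation would call the Runpod API here.
        if not self.is_deployed():
            self.id = self.resource_id  # placeholder id for this sketch
        return self
```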
@@ -0,0 +1,4 @@
+ import os
+ import runpod
+
+ runpod.api_key = os.getenv("RUNPOD_API_KEY")