tetra_rp-0.17.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tetra-rp might be problematic.
- tetra_rp/__init__.py +43 -0
- tetra_rp/cli/__init__.py +0 -0
- tetra_rp/cli/commands/__init__.py +1 -0
- tetra_rp/cli/commands/build.py +534 -0
- tetra_rp/cli/commands/deploy.py +370 -0
- tetra_rp/cli/commands/init.py +119 -0
- tetra_rp/cli/commands/resource.py +191 -0
- tetra_rp/cli/commands/run.py +100 -0
- tetra_rp/cli/main.py +85 -0
- tetra_rp/cli/utils/__init__.py +1 -0
- tetra_rp/cli/utils/conda.py +127 -0
- tetra_rp/cli/utils/deployment.py +172 -0
- tetra_rp/cli/utils/ignore.py +139 -0
- tetra_rp/cli/utils/skeleton.py +184 -0
- tetra_rp/cli/utils/skeleton_template/.env.example +3 -0
- tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
- tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
- tetra_rp/cli/utils/skeleton_template/README.md +256 -0
- tetra_rp/cli/utils/skeleton_template/main.py +43 -0
- tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
- tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +20 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +38 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +20 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +62 -0
- tetra_rp/client.py +128 -0
- tetra_rp/config.py +29 -0
- tetra_rp/core/__init__.py +0 -0
- tetra_rp/core/api/__init__.py +6 -0
- tetra_rp/core/api/runpod.py +319 -0
- tetra_rp/core/exceptions.py +50 -0
- tetra_rp/core/resources/__init__.py +37 -0
- tetra_rp/core/resources/base.py +47 -0
- tetra_rp/core/resources/cloud.py +4 -0
- tetra_rp/core/resources/constants.py +4 -0
- tetra_rp/core/resources/cpu.py +146 -0
- tetra_rp/core/resources/environment.py +41 -0
- tetra_rp/core/resources/gpu.py +68 -0
- tetra_rp/core/resources/live_serverless.py +62 -0
- tetra_rp/core/resources/network_volume.py +148 -0
- tetra_rp/core/resources/resource_manager.py +145 -0
- tetra_rp/core/resources/serverless.py +463 -0
- tetra_rp/core/resources/serverless_cpu.py +162 -0
- tetra_rp/core/resources/template.py +94 -0
- tetra_rp/core/resources/utils.py +50 -0
- tetra_rp/core/utils/__init__.py +0 -0
- tetra_rp/core/utils/backoff.py +43 -0
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/json.py +33 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/core/utils/singleton.py +21 -0
- tetra_rp/core/validation.py +44 -0
- tetra_rp/execute_class.py +319 -0
- tetra_rp/logger.py +34 -0
- tetra_rp/protos/__init__.py +0 -0
- tetra_rp/protos/remote_execution.py +148 -0
- tetra_rp/stubs/__init__.py +5 -0
- tetra_rp/stubs/live_serverless.py +155 -0
- tetra_rp/stubs/registry.py +117 -0
- tetra_rp/stubs/serverless.py +30 -0
- tetra_rp-0.17.1.dist-info/METADATA +976 -0
- tetra_rp-0.17.1.dist-info/RECORD +66 -0
- tetra_rp-0.17.1.dist-info/WHEEL +5 -0
- tetra_rp-0.17.1.dist-info/entry_points.txt +2 -0
- tetra_rp-0.17.1.dist-info/top_level.txt +1 -0
tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py
ADDED
@@ -0,0 +1,62 @@
from tetra_rp import remote, LiveServerless, GpuGroup


gpu_config = LiveServerless(
    name="gpu_worker",
    gpus=[GpuGroup.ANY],
    workersMin=0,
    workersMax=3,
    idleTimeout=5,
)


@remote(resource_config=gpu_config, dependencies=["torch"])
async def gpu_hello(input_data: dict) -> dict:
    """Simple GPU worker example with GPU detection."""
    import platform
    from datetime import datetime

    try:
        import torch

        gpu_available = torch.cuda.is_available()
        if gpu_available:
            gpu_name = torch.cuda.get_device_name(0)
            gpu_count = torch.cuda.device_count()
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        else:
            gpu_name = "No GPU detected"
            gpu_count = 0
            gpu_memory = 0
    except Exception as e:
        gpu_available = False
        gpu_name = f"Error detecting GPU: {str(e)}"
        gpu_count = 0
        gpu_memory = 0

    message = input_data.get("message", "Hello from GPU worker!")

    return {
        "status": "success",
        "message": message,
        "worker_type": "GPU",
        "gpu_info": {
            "available": gpu_available,
            "name": gpu_name,
            "count": gpu_count,
            "memory_gb": round(gpu_memory, 2) if gpu_memory else 0,
        },
        "timestamp": datetime.now().isoformat(),
        "platform": platform.system(),
        "python_version": platform.python_version(),
    }


# Test locally with: python -m workers.gpu.endpoint
if __name__ == "__main__":
    import asyncio

    test_payload = {"message": "Testing GPU worker"}
    print(f"Testing GPU worker with payload: {test_payload}")
    result = asyncio.run(gpu_hello(test_payload))
    print(f"Result: {result}")
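Given the decorator semantics in tetra_rp/client.py below, the decorated gpu_hello is awaitable from any async caller. A minimal sketch of invoking it from the skeleton's main.py (that file appears in the manifest above but its body is not part of this diff, so the call shape is an assumption):

```python
import asyncio

from workers.gpu.endpoint import gpu_hello


async def main():
    # @remote returns an awaitable wrapper; the function body runs on the
    # provisioned LiveServerless endpoint, not on this machine.
    result = await gpu_hello({"message": "Hello from main"})
    print(result["gpu_info"])


if __name__ == "__main__":
    asyncio.run(main())
```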
tetra_rp/client.py
ADDED
@@ -0,0 +1,128 @@
import os
import inspect
import logging
from functools import wraps
from typing import List, Optional

from .core.resources import ResourceManager, ServerlessResource
from .execute_class import create_remote_class
from .stubs import stub_resource

log = logging.getLogger(__name__)


def remote(
    resource_config: ServerlessResource,
    dependencies: Optional[List[str]] = None,
    system_dependencies: Optional[List[str]] = None,
    accelerate_downloads: bool = True,
    local: bool = False,
    **extra,
):
    """
    Decorator to enable dynamic resource provisioning and dependency management for serverless functions.

    This decorator allows a function to be executed in a remote serverless environment, with support for
    dynamic resource provisioning and installation of required dependencies. It can also bypass remote
    execution entirely for local testing.

    Supports both sync and async function definitions:
    - `def my_function(...)` - Regular synchronous function
    - `async def my_function(...)` - Asynchronous function

    In both cases, the decorated function returns an awaitable that must be called with `await`.

    Args:
        resource_config (ServerlessResource): Configuration object specifying the serverless resource
            to be provisioned or used. Not used when local=True.
        dependencies (List[str], optional): A list of pip package names to be installed in the remote
            environment before executing the function. Not used when local=True. Defaults to None.
        system_dependencies (List[str], optional): A list of system packages to be installed in the remote
            environment before executing the function. Not used when local=True. Defaults to None.
        accelerate_downloads (bool, optional): Enable download acceleration for dependencies and models.
            Only applies to remote execution. Defaults to True.
        local (bool, optional): Execute function/class locally instead of provisioning remote servers.
            Returns the unwrapped function/class for direct local execution. Users must ensure all required
            dependencies are already installed in their local environment. Defaults to False.
        extra (dict, optional): Additional parameters for the execution of the resource. Defaults to an empty dict.

    Returns:
        Callable: A decorator that wraps the target function, enabling remote execution with the specified
            resource configuration and dependencies, or returns the unwrapped function/class for local execution.

    Example:
        ```python
        # Async function (recommended style)
        @remote(
            resource_config=my_resource_config,
            dependencies=["torch>=2.0.0"],
        )
        async def gpu_task(data: dict) -> dict:
            import torch
            # GPU processing here
            return {"result": "processed"}

        # Sync function (also supported)
        @remote(
            resource_config=my_resource_config,
            dependencies=["pandas>=2.0.0"],
        )
        def cpu_task(data: dict) -> dict:
            import pandas as pd
            # CPU processing here
            return {"result": "processed"}

        # Local execution (testing/development)
        @remote(
            resource_config=my_resource_config,
            dependencies=["numpy", "pandas"],  # Only used for remote execution
            local=True,
        )
        async def my_test_function(data):
            # Runs locally - dependencies must be pre-installed
            pass
        ```
    """

    def decorator(func_or_class):
        if os.getenv("RUNPOD_POD_ID") or os.getenv("RUNPOD_ENDPOINT_ID"):
            # Worker mode when running on RunPod platform
            return func_or_class

        # Local execution mode - execute without provisioning remote servers
        if local:
            return func_or_class

        # Remote execution mode
        if inspect.isclass(func_or_class):
            # Handle class decoration
            return create_remote_class(
                func_or_class,
                resource_config,
                dependencies,
                system_dependencies,
                accelerate_downloads,
                extra,
            )
        else:
            # Handle function decoration
            @wraps(func_or_class)
            async def wrapper(*args, **kwargs):
                resource_manager = ResourceManager()
                remote_resource = await resource_manager.get_or_deploy_resource(
                    resource_config
                )

                stub = stub_resource(remote_resource, **extra)
                return await stub(
                    func_or_class,
                    dependencies,
                    system_dependencies,
                    accelerate_downloads,
                    *args,
                    **kwargs,
                )

            return wrapper

    return decorator
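Classes take a different path than functions: remote() routes them through create_remote_class, which lives in tetra_rp/execute_class.py and is not shown in this excerpt. A hedged sketch of decorating a class; the Embedder name and method are illustrative only, and the exact remote-method call semantics depend on execute_class.py:

```python
from tetra_rp import remote, LiveServerless, GpuGroup

config = LiveServerless(name="embedder", gpus=[GpuGroup.ANY])


@remote(resource_config=config, dependencies=["sentence-transformers"])
class Embedder:
    # Hypothetical class for illustration; inspect.isclass() in the decorator
    # sends it to create_remote_class rather than the async function wrapper.
    def embed(self, texts: list) -> list:
        ...
```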
tetra_rp/config.py
ADDED
@@ -0,0 +1,29 @@
"""Configuration management for tetra-rp CLI."""

from pathlib import Path
from typing import NamedTuple


class TetraPaths(NamedTuple):
    """Paths for tetra-rp configuration and data."""

    tetra_dir: Path
    config_file: Path
    deployments_file: Path

    def ensure_tetra_dir(self) -> None:
        """Ensure the .tetra directory exists."""
        self.tetra_dir.mkdir(exist_ok=True)


def get_paths() -> TetraPaths:
    """Get standardized paths for tetra-rp configuration."""
    tetra_dir = Path.cwd() / ".tetra"
    config_file = tetra_dir / "config.json"
    deployments_file = tetra_dir / "deployments.json"

    return TetraPaths(
        tetra_dir=tetra_dir,
        config_file=config_file,
        deployments_file=deployments_file,
    )
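Usage is straightforward; a small sketch of how a CLI command might resolve and create the project-local state directory:

```python
from tetra_rp.config import get_paths

paths = get_paths()
paths.ensure_tetra_dir()       # creates ./.tetra under the current directory if missing
print(paths.config_file)       # <cwd>/.tetra/config.json
print(paths.deployments_file)  # <cwd>/.tetra/deployments.json
```

Note that get_paths() is anchored at Path.cwd(), so these helpers assume CLI commands are run from the project root.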
tetra_rp/core/api/runpod.py
ADDED
@@ -0,0 +1,319 @@
"""
Direct GraphQL communication with Runpod API.
Bypasses the outdated runpod-python SDK limitations.
"""

import json
import logging
import os
from typing import Any, Dict, Optional

import aiohttp

from tetra_rp.core.exceptions import RunpodAPIKeyError

log = logging.getLogger(__name__)

RUNPOD_API_BASE_URL = os.environ.get("RUNPOD_API_BASE_URL", "https://api.runpod.io")
RUNPOD_REST_API_URL = os.environ.get("RUNPOD_REST_API_URL", "https://rest.runpod.io/v1")


class RunpodGraphQLClient:
    """
    Runpod GraphQL client for Runpod API.
    Communicates directly with Runpod's GraphQL endpoint without SDK limitations.
    """

    GRAPHQL_URL = f"{RUNPOD_API_BASE_URL}/graphql"

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
        if not self.api_key:
            raise RunpodAPIKeyError()

        self.session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create an aiohttp session."""
        if self.session is None or self.session.closed:
            timeout = aiohttp.ClientTimeout(total=300)  # 5 minute timeout
            self.session = aiohttp.ClientSession(
                timeout=timeout,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
            )
        return self.session

    async def _execute_graphql(
        self, query: str, variables: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Execute a GraphQL query/mutation."""
        session = await self._get_session()

        payload = {"query": query, "variables": variables or {}}

        log.debug(f"GraphQL Query: {query}")
        log.debug(f"GraphQL Variables: {json.dumps(variables, indent=2)}")

        try:
            async with session.post(self.GRAPHQL_URL, json=payload) as response:
                response_data = await response.json()

                log.debug(f"GraphQL Response Status: {response.status}")
                log.debug(f"GraphQL Response: {json.dumps(response_data, indent=2)}")

                if response.status >= 400:
                    raise Exception(
                        f"GraphQL request failed: {response.status} - {response_data}"
                    )

                if "errors" in response_data:
                    errors = response_data["errors"]
                    error_msg = "; ".join([e.get("message", str(e)) for e in errors])
                    raise Exception(f"GraphQL errors: {error_msg}")

                return response_data.get("data", {})

        except aiohttp.ClientError as e:
            log.error(f"HTTP client error: {e}")
            raise Exception(f"HTTP request failed: {e}")

    async def create_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Create a serverless endpoint using direct GraphQL mutation.
        Supports both GPU and CPU endpoints with full field support.
        """
        # GraphQL mutation for saveEndpoint (based on actual schema)
        mutation = """
        mutation saveEndpoint($input: EndpointInput!) {
            saveEndpoint(input: $input) {
                aiKey
                gpuIds
                id
                idleTimeout
                locations
                name
                networkVolumeId
                scalerType
                scalerValue
                templateId
                type
                userId
                version
                workersMax
                workersMin
                workersStandby
                workersPFBTarget
                gpuCount
                allowedCudaVersions
                executionTimeoutMs
                instanceIds
                activeBuildid
                idePodId
            }
        }
        """

        variables = {"input": input_data}

        log.debug(
            f"Creating endpoint with GraphQL: {input_data.get('name', 'unnamed')}"
        )

        result = await self._execute_graphql(mutation, variables)

        if "saveEndpoint" not in result:
            raise Exception("Unexpected GraphQL response structure")

        endpoint_data = result["saveEndpoint"]
        log.info(
            f"Created endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}"
        )

        return endpoint_data

    async def get_cpu_types(self) -> Dict[str, Any]:
        """Get available CPU types."""
        query = """
        query getCpuTypes {
            cpuTypes {
                id
                displayName
                manufacturer
                cores
                threadsPerCore
                groupId
            }
        }
        """

        result = await self._execute_graphql(query)
        return result.get("cpuTypes", [])

    async def get_gpu_types(
        self, gpu_filter: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Get available GPU types."""
        query = """
        query getGpuTypes($input: GpuTypeFilter) {
            gpuTypes(input: $input) {
                id
                displayName
                manufacturer
                memoryInGb
                cudaCores
                secureCloud
                communityCloud
                securePrice
                communityPrice
                communitySpotPrice
                secureSpotPrice
                maxGpuCount
                maxGpuCountCommunityCloud
                maxGpuCountSecureCloud
                minPodGpuCount
                nodeGroupGpuSizes
                throughput
            }
        }
        """

        variables = {"input": gpu_filter} if gpu_filter else {}
        result = await self._execute_graphql(query, variables)
        return result.get("gpuTypes", [])

    async def get_endpoint(self, endpoint_id: str) -> Dict[str, Any]:
        """Get endpoint details."""
        # Note: The schema doesn't show a specific endpoint query
        # This would need to be implemented if such query exists
        raise NotImplementedError("Get endpoint query not available in current schema")

    async def delete_endpoint(self, endpoint_id: str) -> Dict[str, Any]:
        """Delete a serverless endpoint."""
        mutation = """
        mutation deleteEndpoint($id: String!) {
            deleteEndpoint(id: $id)
        }
        """

        variables = {"id": endpoint_id}
        log.info(f"Deleting endpoint: {endpoint_id}")

        result = await self._execute_graphql(mutation, variables)
        return {"success": result.get("deleteEndpoint") is not None}

    async def close(self):
        """Close the HTTP session."""
        if self.session and not self.session.closed:
            await self.session.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()


class RunpodRestClient:
    """
    Runpod REST client for Runpod API.
    Provides methods to interact with Runpod's REST endpoints.
    """

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
        if not self.api_key:
            raise RunpodAPIKeyError()

        self.session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create an aiohttp session."""
        if self.session is None or self.session.closed:
            timeout = aiohttp.ClientTimeout(total=300)  # 5 minute timeout
            self.session = aiohttp.ClientSession(
                timeout=timeout,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
            )
        return self.session

    async def _execute_rest(
        self, method: str, url: str, data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Execute a REST API request."""
        session = await self._get_session()

        log.debug(f"REST Request: {method} {url}")
        log.debug(f"REST Data: {json.dumps(data, indent=2) if data else 'None'}")

        try:
            async with session.request(method, url, json=data) as response:
                response_data = await response.json()

                log.debug(f"REST Response Status: {response.status}")
                log.debug(f"REST Response: {json.dumps(response_data, indent=2)}")

                if response.status >= 400:
                    raise Exception(
                        f"REST request failed: {response.status} - {response_data}"
                    )

                return response_data

        except aiohttp.ClientError as e:
            log.error(f"HTTP client error: {e}")
            raise Exception(f"HTTP request failed: {e}")

    async def create_network_volume(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Create a network volume in Runpod."""
        log.debug(f"Creating network volume: {payload.get('name', 'unnamed')}")

        result = await self._execute_rest(
            "POST", f"{RUNPOD_REST_API_URL}/networkvolumes", payload
        )

        log.info(
            f"Created network volume: {result.get('id', 'unknown')} - {result.get('name', 'unnamed')}"
        )

        return result

    async def list_network_volumes(self) -> Dict[str, Any]:
        """
        List all network volumes in Runpod.

        Returns:
            List of network volume objects or dict containing networkVolumes key.
            The API may return either format depending on version.
        """
        log.debug("Listing network volumes")

        result = await self._execute_rest(
            "GET", f"{RUNPOD_REST_API_URL}/networkvolumes"
        )

        # Handle both list and dict responses
        if isinstance(result, list):
            volume_count = len(result)
        else:
            volume_count = len(result.get("networkVolumes", []))

        log.debug(f"Listed {volume_count} network volumes")

        return result

    async def close(self):
        """Close the HTTP session."""
        if self.session and not self.session.closed:
            await self.session.close()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
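Both clients are async context managers, so sessions are closed deterministically. A minimal usage sketch, assuming RUNPOD_API_KEY is set in the environment (note the type-query methods are annotated Dict[str, Any] but, per the code above, actually return lists):

```python
import asyncio

from tetra_rp.core.api.runpod import RunpodGraphQLClient, RunpodRestClient


async def main():
    # GraphQL: list available GPU types (no filter).
    async with RunpodGraphQLClient() as gql:
        gpu_types = await gql.get_gpu_types()
        print(f"{len(gpu_types)} GPU types available")

    # REST: list network volumes; the response may be a list or a dict.
    async with RunpodRestClient() as rest:
        volumes = await rest.list_network_volumes()
        print(volumes)


asyncio.run(main())
```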
tetra_rp/core/exceptions.py
ADDED
@@ -0,0 +1,50 @@
"""Custom exceptions for tetra_rp.

Provides clear, actionable error messages for common failure scenarios.
"""


class RunpodAPIKeyError(Exception):
    """Raised when RUNPOD_API_KEY environment variable is missing or invalid.

    This exception provides helpful guidance on how to obtain and configure
    the API key required for remote execution and deployment features.
    """

    def __init__(self, message: str | None = None):
        """Initialize with optional custom message.

        Args:
            message: Optional custom error message. If not provided, uses default.
        """
        if message is None:
            message = self._default_message()
        super().__init__(message)

    @staticmethod
    def _default_message() -> str:
        """Generate default error message with setup instructions.

        Returns:
            Formatted error message with actionable steps.
        """
        return """RUNPOD_API_KEY environment variable is required but not set.

To use Flash remote execution features, you need a Runpod API key.

Get your API key:
  https://docs.runpod.io/get-started/api-keys

Set your API key using one of these methods:

1. Environment variable:
   export RUNPOD_API_KEY=your_api_key_here

2. In your project's .env file:
   echo "RUNPOD_API_KEY=your_api_key_here" >> .env

3. In your shell profile (~/.bashrc, ~/.zshrc):
   echo 'export RUNPOD_API_KEY=your_api_key_here' >> ~/.bashrc

Note: If you created a .env file, make sure it's in your current directory
or project root where Flash can find it."""
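A short sketch of the failure path: constructing either API client without a key raises this exception, and printing it yields the setup instructions above.

```python
import os

from tetra_rp.core.api.runpod import RunpodGraphQLClient
from tetra_rp.core.exceptions import RunpodAPIKeyError

os.environ.pop("RUNPOD_API_KEY", None)  # simulate a missing key

try:
    RunpodGraphQLClient()
except RunpodAPIKeyError as e:
    print(e)  # multi-line message with the three setup options
```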
tetra_rp/core/resources/__init__.py
ADDED
@@ -0,0 +1,37 @@
from .base import BaseResource, DeployableResource
from .cpu import CpuInstanceType
from .gpu import GpuGroup, GpuType, GpuTypeDetail
from .resource_manager import ResourceManager
from .live_serverless import LiveServerless, CpuLiveServerless
from .serverless import (
    ServerlessResource,
    ServerlessEndpoint,
    JobOutput,
    CudaVersion,
    ServerlessType,
)
from .serverless_cpu import CpuServerlessEndpoint
from .template import PodTemplate
from .network_volume import NetworkVolume, DataCenter


__all__ = [
    "BaseResource",
    "CpuInstanceType",
    "CpuLiveServerless",
    "CpuServerlessEndpoint",
    "CudaVersion",
    "DataCenter",
    "DeployableResource",
    "GpuGroup",
    "GpuType",
    "GpuTypeDetail",
    "JobOutput",
    "LiveServerless",
    "ResourceManager",
    "ServerlessResource",
    "ServerlessEndpoint",
    "ServerlessType",
    "PodTemplate",
    "NetworkVolume",
]
tetra_rp/core/resources/base.py
ADDED
@@ -0,0 +1,47 @@
import hashlib
from abc import ABC, abstractmethod
from typing import Optional
from pydantic import BaseModel, ConfigDict


class BaseResource(BaseModel):
    """Base class for all resources."""

    model_config = ConfigDict(
        validate_by_name=True,
        validate_default=True,
        serialize_by_alias=True,
    )

    id: Optional[str] = None

    @property
    def resource_id(self) -> str:
        """Unique resource ID based on configuration."""
        resource_type = self.__class__.__name__
        config_str = self.model_dump_json(exclude_none=True)
        hash_obj = hashlib.md5(f"{resource_type}:{config_str}".encode())
        return f"{resource_type}_{hash_obj.hexdigest()}"


class DeployableResource(BaseResource, ABC):
    """Base class for deployable resources."""

    def __str__(self) -> str:
        return f"{self.__class__.__name__}"

    @property
    @abstractmethod
    def url(self) -> str:
        """Public URL of the resource."""
        raise NotImplementedError("Subclasses should implement this method.")

    @abstractmethod
    def is_deployed(self) -> bool:
        """Check whether the resource is still valid or available."""
        raise NotImplementedError("Subclasses should implement this method.")

    @abstractmethod
    async def deploy(self) -> "DeployableResource":
        """Deploy the resource."""
        raise NotImplementedError("Subclasses should implement this method.")