tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tetra_rp/__init__.py +109 -19
- tetra_rp/cli/commands/__init__.py +1 -0
- tetra_rp/cli/commands/apps.py +143 -0
- tetra_rp/cli/commands/build.py +1082 -0
- tetra_rp/cli/commands/build_utils/__init__.py +1 -0
- tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
- tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
- tetra_rp/cli/commands/build_utils/manifest.py +430 -0
- tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
- tetra_rp/cli/commands/build_utils/scanner.py +596 -0
- tetra_rp/cli/commands/deploy.py +580 -0
- tetra_rp/cli/commands/init.py +123 -0
- tetra_rp/cli/commands/resource.py +108 -0
- tetra_rp/cli/commands/run.py +296 -0
- tetra_rp/cli/commands/test_mothership.py +458 -0
- tetra_rp/cli/commands/undeploy.py +533 -0
- tetra_rp/cli/main.py +97 -0
- tetra_rp/cli/utils/__init__.py +1 -0
- tetra_rp/cli/utils/app.py +15 -0
- tetra_rp/cli/utils/conda.py +127 -0
- tetra_rp/cli/utils/deployment.py +530 -0
- tetra_rp/cli/utils/ignore.py +143 -0
- tetra_rp/cli/utils/skeleton.py +184 -0
- tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
- tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
- tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
- tetra_rp/cli/utils/skeleton_template/README.md +263 -0
- tetra_rp/cli/utils/skeleton_template/main.py +44 -0
- tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
- tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
- tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
- tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
- tetra_rp/client.py +136 -33
- tetra_rp/config.py +29 -0
- tetra_rp/core/api/runpod.py +591 -39
- tetra_rp/core/deployment.py +232 -0
- tetra_rp/core/discovery.py +425 -0
- tetra_rp/core/exceptions.py +50 -0
- tetra_rp/core/resources/__init__.py +27 -9
- tetra_rp/core/resources/app.py +738 -0
- tetra_rp/core/resources/base.py +139 -4
- tetra_rp/core/resources/constants.py +21 -0
- tetra_rp/core/resources/cpu.py +115 -13
- tetra_rp/core/resources/gpu.py +182 -16
- tetra_rp/core/resources/live_serverless.py +153 -16
- tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
- tetra_rp/core/resources/network_volume.py +126 -31
- tetra_rp/core/resources/resource_manager.py +436 -35
- tetra_rp/core/resources/serverless.py +537 -120
- tetra_rp/core/resources/serverless_cpu.py +201 -0
- tetra_rp/core/resources/template.py +1 -59
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/http.py +67 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/core/utils/singleton.py +36 -1
- tetra_rp/core/validation.py +44 -0
- tetra_rp/execute_class.py +301 -0
- tetra_rp/protos/remote_execution.py +98 -9
- tetra_rp/runtime/__init__.py +1 -0
- tetra_rp/runtime/circuit_breaker.py +274 -0
- tetra_rp/runtime/config.py +12 -0
- tetra_rp/runtime/exceptions.py +49 -0
- tetra_rp/runtime/generic_handler.py +206 -0
- tetra_rp/runtime/lb_handler.py +189 -0
- tetra_rp/runtime/load_balancer.py +160 -0
- tetra_rp/runtime/manifest_fetcher.py +192 -0
- tetra_rp/runtime/metrics.py +325 -0
- tetra_rp/runtime/models.py +73 -0
- tetra_rp/runtime/mothership_provisioner.py +512 -0
- tetra_rp/runtime/production_wrapper.py +266 -0
- tetra_rp/runtime/reliability_config.py +149 -0
- tetra_rp/runtime/retry_manager.py +118 -0
- tetra_rp/runtime/serialization.py +124 -0
- tetra_rp/runtime/service_registry.py +346 -0
- tetra_rp/runtime/state_manager_client.py +248 -0
- tetra_rp/stubs/live_serverless.py +35 -17
- tetra_rp/stubs/load_balancer_sls.py +357 -0
- tetra_rp/stubs/registry.py +145 -19
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
- tetra_rp-0.24.0.dist-info/RECORD +99 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
- tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
- tetra_rp/core/pool/cluster_manager.py +0 -177
- tetra_rp/core/pool/dataclass.py +0 -18
- tetra_rp/core/pool/ex.py +0 -38
- tetra_rp/core/pool/job.py +0 -22
- tetra_rp/core/pool/worker.py +0 -19
- tetra_rp/core/resources/utils.py +0 -50
- tetra_rp/core/utils/json.py +0 -33
- tetra_rp-0.6.0.dist-info/RECORD +0 -39
- /tetra_rp/{core/pool → cli}/__init__.py +0 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Factory for creating FastAPI load-balanced handlers.
|
|
2
|
+
|
|
3
|
+
This module provides the factory function for generating FastAPI applications
|
|
4
|
+
that handle load-balanced serverless endpoints. It supports:
|
|
5
|
+
- User-defined HTTP routes
|
|
6
|
+
- /execute endpoint for @remote function execution (LiveLoadBalancer only)
|
|
7
|
+
|
|
8
|
+
Security Model:
|
|
9
|
+
The /execute endpoint accepts and executes serialized function code. This is
|
|
10
|
+
secure because:
|
|
11
|
+
1. The function code originates from the client's @remote decorator
|
|
12
|
+
2. The client (user) controls what function gets sent
|
|
13
|
+
3. This mirrors the trusted client model of LiveServerlessStub
|
|
14
|
+
4. In production, API authentication should protect the /execute endpoint
|
|
15
|
+
|
|
16
|
+
Users should NOT expose the /execute endpoint to untrusted clients.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import inspect
|
|
20
|
+
import logging
|
|
21
|
+
from typing import Any, Callable, Dict
|
|
22
|
+
|
|
23
|
+
from fastapi import FastAPI, Request
|
|
24
|
+
|
|
25
|
+
from .serialization import (
|
|
26
|
+
deserialize_args,
|
|
27
|
+
deserialize_kwargs,
|
|
28
|
+
serialize_arg,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _execute_error(message: str) -> Dict[str, Any]:
    """Build the failure payload returned by the /execute endpoint."""
    return {"success": False, "error": message}


async def _run_remote_function(body: Any) -> Dict[str, Any]:
    """Run one decoded /execute request and build its response payload.

    Args:
        body: The decoded JSON request body. It must be a JSON object with
            string ``function_name`` and ``function_code`` entries and
            optional ``args`` / ``kwargs`` entries.

    Returns:
        ``{"success": True, "result": <serialized result>}`` on success,
        otherwise ``{"success": False, "error": <message>}``.
    """
    # Reject non-object bodies up front; otherwise ``body.get`` raises an
    # AttributeError that surfaces as an opaque "Unexpected error".
    if not isinstance(body, dict):
        return _execute_error("Request body must be a JSON object")

    # Extract function metadata
    function_name = body.get("function_name")
    function_code = body.get("function_code")

    if not function_name or not function_code:
        return _execute_error("Missing function_name or function_code in request")

    if not isinstance(function_name, str) or not isinstance(function_code, str):
        return _execute_error("function_name and function_code must be strings")

    # Deserialize arguments
    try:
        args = deserialize_args(body.get("args", []))
        kwargs = deserialize_kwargs(body.get("kwargs", {}))
    except Exception as e:
        logger.error(f"Failed to deserialize arguments: {e}")
        return _execute_error(f"Failed to deserialize arguments: {e}")

    # SECURITY: exec() runs caller-supplied source code. The fresh dict only
    # keeps the definitions out of this module's globals; it is NOT a sandbox.
    # This endpoint must never be reachable by untrusted clients.
    namespace: Dict[str, Any] = {}
    try:
        exec(function_code, namespace)
    except SyntaxError as e:
        logger.error(f"Syntax error in function code: {e}")
        return _execute_error(f"Syntax error in function code: {e}")
    except Exception as e:
        logger.error(f"Error executing function code: {e}")
        return _execute_error(f"Error executing function code: {e}")

    # Get function from namespace
    if function_name not in namespace:
        return _execute_error(
            f"Function '{function_name}' not found in executed code"
        )

    func = namespace[function_name]

    # Execute function
    try:
        result = func(*args, **kwargs)

        # Calling an ``async def`` function returns a coroutine; await it so
        # the caller receives the actual value.
        if inspect.iscoroutine(result):
            result = await result
    except Exception as e:
        # logger.exception keeps the traceback of the user function.
        logger.exception(f"Function execution failed: {e}")
        return _execute_error(f"Function execution failed: {e}")

    # Serialize result
    try:
        return {"success": True, "result": serialize_arg(result)}
    except Exception as e:
        logger.error(f"Failed to serialize result: {e}")
        return _execute_error(f"Failed to serialize result: {e}")


def create_lb_handler(
    route_registry: Dict[tuple[str, str], Callable],
    include_execute: bool = False,
    lifespan: Callable = None,
) -> FastAPI:
    """Create FastAPI app with routes from registry.

    Args:
        route_registry: Mapping of (HTTP_METHOD, path) -> handler_function
            Example: {("GET", "/api/health"): health_check}
            Methods are matched case-insensitively. Only GET, POST, PUT,
            DELETE and PATCH are registered; any other method is skipped
            with a warning.
        include_execute: Whether to register /execute endpoint for @remote execution.
            Only used for LiveLoadBalancer (local development).
            Deployed endpoints should not expose /execute for security.
        lifespan: Optional lifespan context manager for startup/shutdown hooks.
            Passed straight through to ``FastAPI(lifespan=...)``.

    Returns:
        Configured FastAPI application with routes registered.
    """
    app = FastAPI(title="Flash Load-Balanced Handler", lifespan=lifespan)

    # Register /execute endpoint for @remote stub execution (if enabled)
    if include_execute:

        @app.post("/execute")
        async def execute_remote_function(request: Request) -> Dict[str, Any]:
            """Framework endpoint for @remote decorator execution.

            WARNING: This endpoint is INTERNAL to the Flash framework. It should only be
            called by the @remote stub from tetra_rp.stubs.load_balancer_sls. Exposing
            this endpoint to untrusted clients could allow arbitrary code execution.

            Accepts serialized function code and arguments, executes them,
            and returns serialized result.

            Request body:
                {
                    "function_name": "process_data",
                    "function_code": "def process_data(x, y): return x + y",
                    "args": [base64_encoded_arg1, base64_encoded_arg2],
                    "kwargs": {"key": base64_encoded_value}
                }

            Returns:
                {
                    "success": true,
                    "result": base64_encoded_result
                }
                or
                {
                    "success": false,
                    "error": "error message"
                }
            """
            try:
                body = await request.json()
            except Exception as e:
                logger.error(f"Failed to parse request body: {e}")
                return _execute_error(f"Invalid request body: {e}")

            try:
                return await _run_remote_function(body)
            except Exception as e:
                # Last-resort guard so the endpoint always answers with the
                # documented {"success": ..., "error": ...} shape.
                logger.exception(f"Unexpected error in /execute endpoint: {e}")
                return _execute_error(f"Unexpected error: {e}")

    # Route decorators keyed by upper-case HTTP method.
    registrars = {
        "GET": app.get,
        "POST": app.post,
        "PUT": app.put,
        "DELETE": app.delete,
        "PATCH": app.patch,
    }

    # Register user-defined routes from registry
    for (method, path), handler in route_registry.items():
        register = registrars.get(method.upper())
        if register is None:
            logger.warning(
                f"Unsupported HTTP method '{method}' for path '{path}'. Skipping."
            )
            continue
        register(path)(handler)

    return app
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Load balancing strategies for distributed endpoint routing."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import random
|
|
6
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
7
|
+
|
|
8
|
+
from tetra_rp.runtime.reliability_config import LoadBalancerStrategy
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from tetra_rp.runtime.circuit_breaker import CircuitBreakerRegistry
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LoadBalancer:
    """Pick one endpoint URL out of a candidate list using a configurable strategy.

    The balancer keeps a running round-robin counter and a per-endpoint
    in-flight request count; both are mutated only while holding an
    ``asyncio.Lock``.
    """

    def __init__(
        self, strategy: LoadBalancerStrategy = LoadBalancerStrategy.ROUND_ROBIN
    ):
        """Create a balancer.

        Args:
            strategy: Selection strategy applied by ``select_endpoint``.
        """
        self._in_flight_requests: dict[str, int] = {}
        self._lock = asyncio.Lock()
        self._round_robin_index = 0
        self.strategy = strategy

    async def select_endpoint(
        self,
        endpoints: List[str],
        circuit_breaker_registry: Optional["CircuitBreakerRegistry"] = None,
    ) -> Optional[str]:
        """Choose an endpoint according to the configured strategy.

        Args:
            endpoints: Candidate endpoint URLs.
            circuit_breaker_registry: When given, endpoints whose circuit
                state is OPEN are excluded before selection.

        Returns:
            The chosen endpoint URL, or None when ``endpoints`` is empty or
            every endpoint was excluded by the circuit breaker.
        """
        if not endpoints:
            return None

        candidates = endpoints
        if circuit_breaker_registry is not None:
            # Imported lazily; the module-level import is type-checking only.
            from tetra_rp.runtime.circuit_breaker import CircuitState

            candidates = [
                url
                for url in endpoints
                if circuit_breaker_registry.get_state(url) != CircuitState.OPEN
            ]
            if not candidates:
                logger.warning(
                    f"All {len(endpoints)} endpoints are unhealthy (circuit open)"
                )
                return None

        if self.strategy == LoadBalancerStrategy.LEAST_CONNECTIONS:
            chooser = self._least_connections_select
        elif self.strategy == LoadBalancerStrategy.RANDOM:
            chooser = self._random_select
        else:
            # ROUND_ROBIN, and the fallback for any unrecognised strategy.
            chooser = self._round_robin_select
        return await chooser(candidates)

    async def _round_robin_select(self, endpoints: List[str]) -> str:
        """Return the next endpoint in rotation and advance the counter.

        Args:
            endpoints: Candidate endpoint URLs.

        Returns:
            The endpoint at ``counter % len(endpoints)``.
        """
        async with self._lock:
            position = self._round_robin_index
            chosen = endpoints[position % len(endpoints)]
            self._round_robin_index = position + 1
            logger.debug(
                f"Load balancer: ROUND_ROBIN selected {chosen} "
                f"(index {position})"
            )
            return chosen

    async def _least_connections_select(self, endpoints: List[str]) -> str:
        """Return the endpoint with the smallest recorded in-flight count.

        Endpoints that have no count yet are registered with a count of zero.
        Ties go to the endpoint that appears first in ``endpoints``.

        Args:
            endpoints: Candidate endpoint URLs.

        Returns:
            The chosen endpoint URL.
        """
        async with self._lock:
            counts = self._in_flight_requests
            for url in endpoints:
                counts.setdefault(url, 0)

            # Every candidate now has an entry, so direct lookup is safe.
            chosen = min(endpoints, key=counts.__getitem__)
            in_flight = counts[chosen]

            logger.debug(
                f"Load balancer: LEAST_CONNECTIONS selected {chosen} "
                f"({in_flight} in-flight)"
            )
            return chosen

    async def _random_select(self, endpoints: List[str]) -> str:
        """Return a randomly chosen endpoint.

        Args:
            endpoints: Candidate endpoint URLs.

        Returns:
            The chosen endpoint URL.
        """
        chosen = random.choice(endpoints)
        logger.debug(f"Load balancer: RANDOM selected {chosen}")
        return chosen

    async def record_request(self, endpoint: str) -> None:
        """Increment the in-flight count for ``endpoint``.

        Args:
            endpoint: Endpoint URL the request is being sent to.
        """
        async with self._lock:
            current = self._in_flight_requests.get(endpoint, 0)
            self._in_flight_requests[endpoint] = current + 1

    async def record_request_complete(self, endpoint: str) -> None:
        """Decrement the in-flight count for ``endpoint``, never below zero.

        Endpoints without a recorded count are ignored.

        Args:
            endpoint: Endpoint URL whose request finished.
        """
        async with self._lock:
            if endpoint not in self._in_flight_requests:
                return
            remaining = self._in_flight_requests[endpoint] - 1
            self._in_flight_requests[endpoint] = max(0, remaining)

    def get_stats(self) -> dict[str, int]:
        """Return a copy of the in-flight counters.

        Returns:
            Mapping of endpoint URL to its current in-flight request count.
        """
        return self._in_flight_requests.copy()
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Manifest fetcher with RunPod GQL integration and caching.
|
|
2
|
+
|
|
3
|
+
This module provides manifest fetching from RunPod GraphQL API (source of truth)
|
|
4
|
+
with local file caching and fallback.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Dict, Optional
|
|
13
|
+
|
|
14
|
+
from .config import DEFAULT_CACHE_TTL
|
|
15
|
+
from .generic_handler import load_manifest
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ManifestFetcher:
    """Fetches and caches manifest from RunPod GraphQL API.

    RunPod's GraphQL API is intended to be the source of truth for manifest
    data. The fetcher tries it first, keeps the result in an in-memory cache
    for ``cache_ttl`` seconds, mirrors it to a local JSON file, and falls back
    to the local file when the API call fails or is not implemented.
    """

    def __init__(
        self,
        cache_ttl: int = DEFAULT_CACHE_TTL,
        manifest_path: Optional[Path] = None,
    ):
        """Initialize manifest fetcher.

        Args:
            cache_ttl: Cache time-to-live in seconds (defaults to
                DEFAULT_CACHE_TTL)
            manifest_path: Optional path to local manifest file
        """
        self.cache_ttl = cache_ttl
        self.manifest_path = manifest_path

        # Cache state
        self._cached_manifest: Optional[Dict[str, Any]] = None
        self._cache_loaded_at: float = 0
        self._cache_lock = asyncio.Lock()

    async def get_manifest(
        self,
        mothership_id: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Get manifest from cache or fetch from RunPod GraphQL API.

        Flow:
        1. Check if cached and not expired → return cached
        2. If expired/not cached → fetch from RunPod GraphQL API
        3. Update local flash_manifest.json with fetched data
        4. Cache the result
        5. Return manifest

        If the RunPod GQL fetch fails, is not implemented, or yields an empty
        or malformed manifest, falls back to the local file. The in-memory
        cache is only overwritten once a usable manifest is in hand.

        Args:
            mothership_id: Optional mothership endpoint ID for tracking

        Returns:
            Manifest dictionary or None if unavailable
        """
        async with self._cache_lock:
            now = time.time()
            cache_age = now - self._cache_loaded_at

            # Return cached if still valid
            if self._cached_manifest and cache_age < self.cache_ttl:
                logger.debug(
                    f"Serving cached manifest (age: {cache_age:.1f}s, "
                    f"TTL: {self.cache_ttl}s)"
                )
                return self._cached_manifest

            # Cache expired or not loaded - fetch from RunPod GQL
            logger.debug("Cache expired or empty, fetching from RunPod GraphQL API")

            try:
                # Fetch from RunPod GraphQL API (placeholder)
                manifest = await self._fetch_from_gql(mothership_id)

                # Validate BEFORE touching the cache or the local file so a
                # bad response cannot clobber a previously good manifest.
                if not manifest:
                    raise ValueError("RunPod GQL returned an empty manifest")
                resource_count = len(manifest.get("resources", {}))

                # Update local flash_manifest.json
                self._update_local_file(manifest)

                # Update cache
                self._cached_manifest = manifest
                self._cache_loaded_at = now

                logger.info(
                    f"Manifest fetched from RunPod GQL and cached "
                    f"({resource_count} resources)"
                )
                return manifest

            except NotImplementedError:
                logger.debug(
                    "RunPod GQL fetch not implemented, falling back to local file"
                )
            except Exception as e:
                logger.warning(
                    f"RunPod GQL fetch failed: {e}, falling back to local file"
                )

            # Fallback: load from local file
            manifest = load_manifest(self.manifest_path)
            if manifest:
                # Cache the fallback manifest
                self._cached_manifest = manifest
                self._cache_loaded_at = now
                logger.debug("Loaded and cached manifest from local file")

            return manifest

    async def _fetch_from_gql(
        self,
        mothership_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Fetch manifest from RunPod GraphQL API.

        TBD: Future implementation will query RunPod's GraphQL API
        to retrieve the manifest configuration.

        Args:
            mothership_id: Optional mothership endpoint ID

        Returns:
            Manifest dictionary from RunPod GQL

        Raises:
            NotImplementedError: Always, until the query is implemented

        Note:
            Future implementation will use RunpodGraphQLClient:

            ```python
            async with RunpodGraphQLClient() as client:
                query = '''
                query GetManifest($mothershipId: ID!) {
                    getManifest(mothershipId: $mothershipId) {
                        version
                        projectName
                        generatedAt
                        resources
                        functionRegistry
                    }
                }
                '''
                result = await client.execute(query, {"mothershipId": mothership_id})
                return result["data"]["getManifest"]
            ```
        """
        raise NotImplementedError(
            "RunPod manifest query not yet implemented. "
            "Falling back to local flash_manifest.json file."
        )

    def _update_local_file(self, manifest: Dict[str, Any]) -> None:
        """Update local flash_manifest.json with fetched data.

        Writes to ``manifest_path`` when set, otherwise to
        ``flash_manifest.json`` in the current working directory. Failures are
        logged and swallowed: the local file is only a fallback copy.

        Args:
            manifest: Manifest dictionary to write
        """
        try:
            # Determine file path
            if self.manifest_path:
                file_path = Path(self.manifest_path)
            else:
                file_path = Path.cwd() / "flash_manifest.json"

            # Serialize first: opening the file for writing truncates it, so
            # a manifest that cannot be encoded must fail before that point
            # or the existing fallback file would be destroyed.
            payload = json.dumps(manifest, indent=2)

            # Write manifest to file
            file_path.write_text(payload, encoding="utf-8")

            logger.debug(f"Updated local manifest file: {file_path}")

        except Exception as e:
            logger.warning(f"Failed to update local manifest file: {e}")
            # Non-critical error - cached manifest still valid

    def invalidate_cache(self) -> None:
        """Manually invalidate the cache.

        Resets the load timestamp so the next get_manifest() call treats the
        cache as expired and attempts a fresh fetch.
        """
        self._cache_loaded_at = 0
        logger.debug("Manifest cache invalidated")
|