tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tetra_rp/__init__.py +109 -19
- tetra_rp/cli/commands/__init__.py +1 -0
- tetra_rp/cli/commands/apps.py +143 -0
- tetra_rp/cli/commands/build.py +1082 -0
- tetra_rp/cli/commands/build_utils/__init__.py +1 -0
- tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
- tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
- tetra_rp/cli/commands/build_utils/manifest.py +430 -0
- tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
- tetra_rp/cli/commands/build_utils/scanner.py +596 -0
- tetra_rp/cli/commands/deploy.py +580 -0
- tetra_rp/cli/commands/init.py +123 -0
- tetra_rp/cli/commands/resource.py +108 -0
- tetra_rp/cli/commands/run.py +296 -0
- tetra_rp/cli/commands/test_mothership.py +458 -0
- tetra_rp/cli/commands/undeploy.py +533 -0
- tetra_rp/cli/main.py +97 -0
- tetra_rp/cli/utils/__init__.py +1 -0
- tetra_rp/cli/utils/app.py +15 -0
- tetra_rp/cli/utils/conda.py +127 -0
- tetra_rp/cli/utils/deployment.py +530 -0
- tetra_rp/cli/utils/ignore.py +143 -0
- tetra_rp/cli/utils/skeleton.py +184 -0
- tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
- tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
- tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
- tetra_rp/cli/utils/skeleton_template/README.md +263 -0
- tetra_rp/cli/utils/skeleton_template/main.py +44 -0
- tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
- tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
- tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
- tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
- tetra_rp/client.py +136 -33
- tetra_rp/config.py +29 -0
- tetra_rp/core/api/runpod.py +591 -39
- tetra_rp/core/deployment.py +232 -0
- tetra_rp/core/discovery.py +425 -0
- tetra_rp/core/exceptions.py +50 -0
- tetra_rp/core/resources/__init__.py +27 -9
- tetra_rp/core/resources/app.py +738 -0
- tetra_rp/core/resources/base.py +139 -4
- tetra_rp/core/resources/constants.py +21 -0
- tetra_rp/core/resources/cpu.py +115 -13
- tetra_rp/core/resources/gpu.py +182 -16
- tetra_rp/core/resources/live_serverless.py +153 -16
- tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
- tetra_rp/core/resources/network_volume.py +126 -31
- tetra_rp/core/resources/resource_manager.py +436 -35
- tetra_rp/core/resources/serverless.py +537 -120
- tetra_rp/core/resources/serverless_cpu.py +201 -0
- tetra_rp/core/resources/template.py +1 -59
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/http.py +67 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/core/utils/singleton.py +36 -1
- tetra_rp/core/validation.py +44 -0
- tetra_rp/execute_class.py +301 -0
- tetra_rp/protos/remote_execution.py +98 -9
- tetra_rp/runtime/__init__.py +1 -0
- tetra_rp/runtime/circuit_breaker.py +274 -0
- tetra_rp/runtime/config.py +12 -0
- tetra_rp/runtime/exceptions.py +49 -0
- tetra_rp/runtime/generic_handler.py +206 -0
- tetra_rp/runtime/lb_handler.py +189 -0
- tetra_rp/runtime/load_balancer.py +160 -0
- tetra_rp/runtime/manifest_fetcher.py +192 -0
- tetra_rp/runtime/metrics.py +325 -0
- tetra_rp/runtime/models.py +73 -0
- tetra_rp/runtime/mothership_provisioner.py +512 -0
- tetra_rp/runtime/production_wrapper.py +266 -0
- tetra_rp/runtime/reliability_config.py +149 -0
- tetra_rp/runtime/retry_manager.py +118 -0
- tetra_rp/runtime/serialization.py +124 -0
- tetra_rp/runtime/service_registry.py +346 -0
- tetra_rp/runtime/state_manager_client.py +248 -0
- tetra_rp/stubs/live_serverless.py +35 -17
- tetra_rp/stubs/load_balancer_sls.py +357 -0
- tetra_rp/stubs/registry.py +145 -19
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
- tetra_rp-0.24.0.dist-info/RECORD +99 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
- tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
- tetra_rp/core/pool/cluster_manager.py +0 -177
- tetra_rp/core/pool/dataclass.py +0 -18
- tetra_rp/core/pool/ex.py +0 -38
- tetra_rp/core/pool/job.py +0 -22
- tetra_rp/core/pool/worker.py +0 -19
- tetra_rp/core/resources/utils.py +0 -50
- tetra_rp/core/utils/json.py +0 -33
- tetra_rp-0.6.0.dist-info/RECORD +0 -39
- /tetra_rp/{core/pool → cli}/__init__.py +0 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
|
@@ -1,80 +1,481 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from contextlib import asynccontextmanager
|
|
1
3
|
import cloudpickle
|
|
2
4
|
import logging
|
|
3
|
-
from typing import Dict
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
|
|
8
|
+
from ..exceptions import RunpodAPIKeyError
|
|
6
9
|
from ..utils.singleton import SingletonMixin
|
|
10
|
+
from ..utils.file_lock import file_lock, FileLockError
|
|
7
11
|
|
|
8
12
|
from .base import DeployableResource
|
|
9
13
|
|
|
10
14
|
|
|
11
15
|
log = logging.getLogger(__name__)


# Directory and file used to persist the state of deployed resources between
# runs. The pickle lives in a hidden, project-local directory so each project
# tracks its own deployments independently.
RUNPOD_FLASH_DIR = Path(".runpod")
RESOURCE_STATE_FILE = RUNPOD_FLASH_DIR / "resources.pkl"
|
|
15
20
|
|
|
16
21
|
|
|
17
22
|
class ResourceManager(SingletonMixin):
    """Manages dynamic provisioning and tracking of remote resources."""

    # Class variables shared across all instances (singleton): state lives on
    # the class so every ResourceManager() handle sees the same registry.
    _resources: Dict[str, DeployableResource] = {}
    _resource_configs: Dict[str, str] = {}  # Tracks config hashes for drift detection
    _deployment_locks: Dict[str, asyncio.Lock] = {}  # Per-resource-key async locks
    _global_lock: Optional[asyncio.Lock] = None  # Guards creation of per-resource locks
    _lock_initialized = False  # True once _global_lock has been created
    _resources_initialized = False  # True once state was loaded and migrated
|
|
21
32
|
|
|
22
33
|
def __init__(self):
    """Set up singleton state; both guarded sections run at most once per process.

    First the shared asyncio lock is created, then persisted state is loaded,
    migrated to name-based keys, and its config hashes are refreshed. Order
    matters: migration and hash refresh both operate on the loaded state.
    """
    # Ensure async locks are initialized properly for the singleton instance
    if not ResourceManager._lock_initialized:
        ResourceManager._global_lock = asyncio.Lock()
        ResourceManager._lock_initialized = True

    # Load resources immediately on initialization (only once)
    if not ResourceManager._resources_initialized:
        self._load_resources()
        self._migrate_to_name_based_keys()  # Auto-migrate legacy resources
        self._refresh_config_hashes()  # Refresh config hashes after code changes
        ResourceManager._resources_initialized = True
|
|
25
45
|
|
|
26
46
|
def _load_resources(self) -> Dict[str, DeployableResource]:
    """Load persisted resource state from disk using cross-platform file locking.

    Reads the pickled state file (if present) under a shared lock so readers
    do not conflict with a concurrent writer. Supports both the legacy pickle
    payload (a bare resources dict) and the current payload (a
    ``(resources, config_hashes)`` tuple). Loading is best-effort: failures
    are logged and the in-memory state is left as-is.

    Returns:
        The shared resources dict (also stored on the class).
    """
    if RESOURCE_STATE_FILE.exists():
        try:
            with open(RESOURCE_STATE_FILE, "rb") as f:
                # Acquire shared lock for reading (cross-platform)
                with file_lock(f, exclusive=False):
                    data = cloudpickle.load(f)

                    # Handle both old (dict) and new (tuple) pickle formats
                    if isinstance(data, tuple) and len(data) == 2:
                        self._resources, self._resource_configs = data
                    else:
                        # Legacy format: just resources dict
                        self._resources = data
                        self._resource_configs = {}

                    log.debug(
                        f"Loaded {len(self._resources)} saved resources from {RESOURCE_STATE_FILE}:\n"
                        f" Keys: {list(self._resources.keys())}"
                    )
        # Fix: `Exception` already covers FileLockError, so the previous
        # `except (FileLockError, Exception)` tuple was redundant.
        except Exception as e:
            log.error(f"Failed to load resources from {RESOURCE_STATE_FILE}: {e}")
    return self._resources
|
|
36
70
|
|
|
71
|
+
def _migrate_to_name_based_keys(self) -> None:
    """Migrate from hash-based keys to name-based keys.

    Legacy format: {resource_id_hash: resource}
    New format:    {ResourceType:name: resource}

    Name-based keys give a resource a stable identity across config edits,
    which is what enables config drift detection and in-place updates.
    """
    migrated = {}
    migrated_configs = {}

    for key, resource in self._resources.items():
        # Check if already using name-based key format (':' separator rather
        # than the legacy 'ClassName_<hash>' underscore form)
        if ":" in key and not key.startswith(resource.__class__.__name__ + "_"):
            # Already migrated; keep the stored hash, falling back to a
            # freshly computed one if none was recorded
            migrated[key] = resource
            migrated_configs[key] = self._resource_configs.get(
                key, resource.config_hash
            )
            continue

        # Legacy hash-based key - migrate to name-based
        if hasattr(resource, "get_resource_key"):
            new_key = resource.get_resource_key()
            migrated[new_key] = resource
            migrated_configs[new_key] = resource.config_hash
            log.debug(f"Migrated resource: {key} → {new_key}")
        else:
            # Fallback: keep original key if no name available
            migrated[key] = resource
            migrated_configs[key] = self._resource_configs.get(key, "")

    # NOTE(review): this count comparison only notices migrations that merged
    # keys; a pure key rename leaves the count unchanged and is not logged —
    # confirm this is the intended trigger for the info message.
    if len(migrated) != len(self._resources):
        log.info(f"Migrated {len(self._resources)} resources to name-based keys")

    self._resources = migrated
    self._resource_configs = migrated_configs
    self._save_resources()  # Persist migration
|
|
108
|
+
|
|
109
|
+
def _refresh_config_hashes(self) -> None:
|
|
110
|
+
"""Refresh stored config hashes to match current code.
|
|
111
|
+
|
|
112
|
+
This is needed when code changes affect how config_hash is computed
|
|
113
|
+
(e.g., adding field_serializers, changing _input_only sets).
|
|
114
|
+
|
|
115
|
+
Compares stored hash with freshly computed hash. If they differ,
|
|
116
|
+
updates the stored hash to prevent false drift detection.
|
|
117
|
+
"""
|
|
118
|
+
updated = False
|
|
119
|
+
|
|
120
|
+
for key, resource in self._resources.items():
|
|
121
|
+
if not hasattr(resource, "config_hash"):
|
|
122
|
+
continue
|
|
123
|
+
|
|
124
|
+
# Compute fresh hash with current code
|
|
125
|
+
fresh_hash = resource.config_hash
|
|
126
|
+
stored_hash = self._resource_configs.get(key, "")
|
|
127
|
+
|
|
128
|
+
# If hashes differ, update stored hash
|
|
129
|
+
if stored_hash != fresh_hash:
|
|
130
|
+
log.debug(
|
|
131
|
+
f"Refreshing config hash for '{key}': "
|
|
132
|
+
f"{stored_hash[:8]}... → {fresh_hash[:8]}..."
|
|
133
|
+
)
|
|
134
|
+
self._resource_configs[key] = fresh_hash
|
|
135
|
+
updated = True
|
|
136
|
+
|
|
137
|
+
# Save if any hashes were updated
|
|
138
|
+
if updated:
|
|
139
|
+
log.info("Refreshed config hashes after code changes")
|
|
140
|
+
self._save_resources()
|
|
141
|
+
|
|
37
142
|
def _save_resources(self) -> None:
    """Persist state of resources to disk using cross-platform file locking.

    Writes ``(resources, config_hashes)`` as a single pickled tuple under an
    exclusive lock so concurrent processes cannot interleave writes.

    Raises:
        Exception: re-raises any locking or I/O failure after logging it;
            silently losing state would be worse than failing loudly.
    """
    try:
        # Ensure directory exists
        RUNPOD_FLASH_DIR.mkdir(parents=True, exist_ok=True)

        with open(RESOURCE_STATE_FILE, "wb") as f:
            # Acquire exclusive lock for writing (cross-platform)
            with file_lock(f, exclusive=True):
                # Save both resources and config hashes as tuple
                data = (self._resources, self._resource_configs)
                cloudpickle.dump(data, f)
                f.flush()  # Ensure data is written to disk
        log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
    # Fix: `Exception` already covers FileLockError, so the previous
    # `except (FileLockError, Exception)` tuple was redundant.
    except Exception as e:
        log.error(f"Failed to save resources to {RESOURCE_STATE_FILE}: {e}")
        raise
|
|
42
159
|
|
|
43
|
-
def
|
|
44
|
-
"""Add a resource to the manager."""
|
|
160
|
+
def _add_resource(self, uid: str, resource: DeployableResource):
    """Add a resource to the manager (protected method for internal use).

    Records the resource and its current config hash, then immediately
    persists state so a crash cannot lose track of a deployed resource.
    """
    self._resources[uid] = resource
    self._resource_configs[uid] = resource.config_hash
    self._save_resources()
|
|
47
165
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
"""Remove a resource from the manager."""
|
|
166
|
+
def _remove_resource(self, uid: str):
    """Remove a resource from the manager (protected method for internal use)."""
    if uid in self._resources:
        del self._resources[uid]
        self._resource_configs.pop(uid, None)  # Remove config hash too
        log.debug(f"Removed resource {uid}")

        self._save_resources()
    else:
        # Nothing tracked under this uid; warn and leave state untouched
        log.warning(f"Resource {uid} not found for removal")
|
|
59
177
|
|
|
178
|
+
async def register_resource(self, resource: DeployableResource) -> str:
    """Persist a resource config into pickled state. Not thread safe.

    Args:
        resource: Already-configured resource to start tracking.

    Returns:
        The resource's unique ID, used as its tracking key.
    """
    uid = resource.resource_id
    self._add_resource(uid, resource)
    return uid
|
|
183
|
+
|
|
184
|
+
async def _deploy_with_error_context(
    self, config: DeployableResource
) -> DeployableResource:
    """Deploy resource with enhanced error context for RunpodAPIKeyError.

    Args:
        config: Resource configuration to deploy.

    Returns:
        Deployed resource instance.

    Raises:
        RunpodAPIKeyError: If deployment fails due to missing API key, with resource context.
    """
    try:
        return await config._do_deploy()
    except RunpodAPIKeyError as e:
        # Re-raise with the resource name attached so the user can tell which
        # deployment needed the API key; `from e` preserves the cause chain.
        error_msg = f"Cannot deploy resource '{config.name}': {str(e)}"
        raise RunpodAPIKeyError(error_msg) from e
|
|
203
|
+
|
|
204
|
+
async def get_resource_from_store(self, uid: str) -> Optional[DeployableResource]:
    """Return the tracked resource for ``uid``, or None if it is not tracked."""
    return self._resources.get(uid)
|
|
206
|
+
|
|
60
207
|
async def _deploy_and_cache(
    self, resource_key: str, config: DeployableResource
) -> DeployableResource:
    """Deploy ``config``, cache it under ``resource_key``, and return it.

    On failure, if the provider already assigned an ID (i.e. the resource was
    partially created), cache the config anyway so the orphaned resource can
    be found and cleaned up later; the original exception is re-raised.
    """
    try:
        deployed_resource = await self._deploy_with_error_context(config)
        log.info(f"URL: {deployed_resource.url}")
        self._add_resource(resource_key, deployed_resource)
        return deployed_resource
    except Exception:
        # Universal rule: If resource was created (has ID), track it for cleanup
        if hasattr(config, "id") and config.id:
            log.warning(
                f"Deployment failed but resource '{config.name}' was created with ID {config.id}, "
                f"caching for cleanup"
            )
            self._add_resource(resource_key, config)
        raise

async def get_or_deploy_resource(
    self, config: DeployableResource
) -> DeployableResource:
    """Get existing, update if config changed, or deploy new resource.

    Uses name-based identity (ResourceType:name) instead of config hash.
    This enables automatic config drift detection and updates.

    Flow:
    1. Check if resource with same name exists
    2. If exists, compare config hashes
    3. If config changed, automatically update the endpoint
    4. If no resource exists, deploy new one

    Thread-safe implementation that prevents concurrent deployments.

    Note: the previously triplicated deploy/log/cache/cleanup-on-failure
    block is factored into ``_deploy_and_cache`` with identical behavior.
    """
    # Use name-based key instead of hash
    resource_key = config.get_resource_key()
    new_config_hash = config.config_hash

    log.debug(
        f"get_or_deploy_resource called:\n"
        f" Config type: {type(config).__name__}\n"
        f" Config name: {getattr(config, 'name', 'N/A')}\n"
        f" Resource key: {resource_key}\n"
        f" New config hash: {new_config_hash[:16]}...\n"
        f" Available keys in cache: {list(self._resources.keys())}"
    )

    # Ensure global lock is initialized
    assert ResourceManager._global_lock is not None, "Global lock not initialized"

    # Get or create a per-resource lock (use name-based key)
    async with ResourceManager._global_lock:
        if resource_key not in ResourceManager._deployment_locks:
            ResourceManager._deployment_locks[resource_key] = asyncio.Lock()
        resource_lock = ResourceManager._deployment_locks[resource_key]

    # Acquire per-resource lock
    async with resource_lock:
        existing = self._resources.get(resource_key)

        if existing:
            log.debug(f"Resource found in cache: {resource_key}")
            # Resource exists - check if still valid
            if not existing.is_deployed():
                log.warning(f"{existing} is no longer valid, redeploying.")
                self._remove_resource(resource_key)
                return await self._deploy_and_cache(resource_key, config)

            # Check for config drift
            stored_config_hash = self._resource_configs.get(resource_key, "")

            if stored_config_hash != new_config_hash:
                # Detailed drift debugging
                log.debug(
                    f"DRIFT DEBUG for '{config.name}':\n"
                    f" Stored hash: {stored_config_hash}\n"
                    f" New hash: {new_config_hash}\n"
                    f" Stored resource type: {type(existing).__name__}\n"
                    f" New resource type: {type(config).__name__}\n"
                    f" Existing config fields: {existing.model_dump(exclude_none=True, exclude={'id'}) if hasattr(existing, 'model_dump') else 'N/A'}\n"
                    f" New config fields: {config.model_dump(exclude_none=True, exclude={'id'}) if hasattr(config, 'model_dump') else 'N/A'}"
                )
                log.info(
                    f"Config drift detected for '{config.name}': "
                    f"Automatically updating endpoint"
                )

                # Attempt update (will redeploy if structural changes detected)
                if hasattr(existing, "update"):
                    updated_resource = await existing.update(config)
                    self._add_resource(resource_key, updated_resource)
                    return updated_resource
                else:
                    # Fallback: redeploy if update not supported
                    log.warning(
                        f"{config.name}: Resource type doesn't support updates, "
                        "redeploying"
                    )
                    await existing.undeploy()
                    return await self._deploy_and_cache(resource_key, config)

            # Config unchanged, reuse existing
            log.debug(f"{existing} exists, reusing (config unchanged)")
            log.info(f"URL: {existing.url}")
            return existing

        # No existing resource, deploy new one
        log.debug(
            f"Resource NOT found in cache, deploying new: {resource_key}\n"
            f" Searched in keys: {list(self._resources.keys())}"
        )
        return await self._deploy_and_cache(resource_key, config)
|
|
344
|
+
|
|
345
|
+
@asynccontextmanager
async def resource_lock(self, uid: str):
    """Async context manager that serializes access to a single resource.

    A short critical section under the global lock lazily creates the
    per-``uid`` lock; that per-resource lock is then held for the duration
    of the ``with`` body, so work on different resources can proceed in
    parallel while work on the same resource is serialized.
    """
    # Ensure global lock is initialized (should be done in __init__)
    assert ResourceManager._global_lock is not None, "Global lock not initialized"

    # Get or create a per-resource lock
    async with ResourceManager._global_lock:
        if uid not in ResourceManager._deployment_locks:
            ResourceManager._deployment_locks[uid] = asyncio.Lock()
        resource_lock = ResourceManager._deployment_locks[uid]

    async with resource_lock:
        yield
|
|
358
|
+
|
|
359
|
+
def list_all_resources(self) -> Dict[str, DeployableResource]:
    """Return a snapshot of all tracked resources.

    Returns:
        A shallow copy mapping resource_id -> DeployableResource, so the
        caller can iterate or mutate it without touching internal state.
    """
    return dict(self._resources)
|
|
366
|
+
|
|
367
|
+
def find_resources_by_name(self, name: str) -> List[Tuple[str, DeployableResource]]:
    """Find resources matching the given name.

    Args:
        name: The name to search for (exact match)

    Returns:
        List of (resource_id, resource) tuples matching the name
    """
    return [
        (uid, resource)
        for uid, resource in self._resources.items()
        if hasattr(resource, "name") and resource.name == name
    ]
|
|
381
|
+
|
|
382
|
+
def find_resources_by_provider_id(
    self, provider_id: str
) -> List[Tuple[str, DeployableResource]]:
    """Find resources matching the provider-assigned ID.

    Args:
        provider_id: The provider resource ID to search for (exact match)

    Returns:
        List of (resource_id, resource) tuples matching the provider ID
    """
    return [
        (uid, resource)
        for uid, resource in self._resources.items()
        if getattr(resource, "id", None) == provider_id
    ]
|
|
398
|
+
|
|
399
|
+
async def undeploy_resource(
    self,
    resource_id: str,
    resource_name: Optional[str] = None,
    force_remove: bool = False,
) -> Dict[str, Any]:
    """Undeploy a resource and remove from tracking.

    This is the public interface for removing resources. It calls the resource's
    _do_undeploy() method (polymorphic) and removes from tracking on success.
    The four duplicated result-dict literals are factored into a local builder;
    all messages are unchanged.

    Args:
        resource_id: The resource ID to undeploy
        resource_name: Optional human-readable name for error messages
        force_remove: If True, remove from tracking even if undeploy fails.
            Use this for cleanup scenarios where resource is already deleted remotely.

    Returns:
        Dict with keys:
        - success: bool indicating if undeploy succeeded
        - name: resource name (if available)
        - endpoint_id: resource endpoint ID (if available)
        - message: status message
    """
    resource = self._resources.get(resource_id)
    log.debug(f"existing resource IDs: {list(self._resources.keys())}")

    if not resource:
        return {
            "success": False,
            "name": resource_name or "Unknown",
            "endpoint_id": "N/A",
            "message": f"Resource {resource_id} not found in tracking",
        }

    # Get resource metadata for response
    name = resource_name or getattr(resource, "name", "Unknown")
    endpoint_id = getattr(resource, "id", "N/A")

    def _result(success: bool, message: str) -> Dict[str, Any]:
        # All tracked-resource outcomes share this shape; build it once.
        return {
            "success": success,
            "name": name,
            "endpoint_id": endpoint_id,
            "message": message,
        }

    try:
        # Call polymorphic undeploy method
        success = await resource._do_undeploy()

        if success:
            # Remove from tracking on successful undeploy
            self._remove_resource(resource_id)
            return _result(True, f"Successfully undeployed '{name}' ({endpoint_id})")

        # Force remove if requested (e.g., cleanup of already-deleted resources)
        if force_remove:
            self._remove_resource(resource_id)
        return _result(False, f"Failed to undeploy '{name}' ({endpoint_id})")

    except NotImplementedError as e:
        # Resource type doesn't support undeploy yet
        if force_remove:
            self._remove_resource(resource_id)
        return _result(False, f"Cannot undeploy '{name}': {str(e)}")
    except Exception as e:
        # Unexpected error during undeploy (e.g., already deleted remotely)
        if force_remove:
            self._remove_resource(resource_id)
        return _result(False, f"Error undeploying '{name}': {str(e)}")
|