tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. tetra_rp/__init__.py +109 -19
  2. tetra_rp/cli/commands/__init__.py +1 -0
  3. tetra_rp/cli/commands/apps.py +143 -0
  4. tetra_rp/cli/commands/build.py +1082 -0
  5. tetra_rp/cli/commands/build_utils/__init__.py +1 -0
  6. tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
  7. tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
  8. tetra_rp/cli/commands/build_utils/manifest.py +430 -0
  9. tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
  10. tetra_rp/cli/commands/build_utils/scanner.py +596 -0
  11. tetra_rp/cli/commands/deploy.py +580 -0
  12. tetra_rp/cli/commands/init.py +123 -0
  13. tetra_rp/cli/commands/resource.py +108 -0
  14. tetra_rp/cli/commands/run.py +296 -0
  15. tetra_rp/cli/commands/test_mothership.py +458 -0
  16. tetra_rp/cli/commands/undeploy.py +533 -0
  17. tetra_rp/cli/main.py +97 -0
  18. tetra_rp/cli/utils/__init__.py +1 -0
  19. tetra_rp/cli/utils/app.py +15 -0
  20. tetra_rp/cli/utils/conda.py +127 -0
  21. tetra_rp/cli/utils/deployment.py +530 -0
  22. tetra_rp/cli/utils/ignore.py +143 -0
  23. tetra_rp/cli/utils/skeleton.py +184 -0
  24. tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
  25. tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
  26. tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
  27. tetra_rp/cli/utils/skeleton_template/README.md +263 -0
  28. tetra_rp/cli/utils/skeleton_template/main.py +44 -0
  29. tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
  30. tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
  31. tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
  32. tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
  33. tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
  34. tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
  35. tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
  36. tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
  37. tetra_rp/client.py +136 -33
  38. tetra_rp/config.py +29 -0
  39. tetra_rp/core/api/runpod.py +591 -39
  40. tetra_rp/core/deployment.py +232 -0
  41. tetra_rp/core/discovery.py +425 -0
  42. tetra_rp/core/exceptions.py +50 -0
  43. tetra_rp/core/resources/__init__.py +27 -9
  44. tetra_rp/core/resources/app.py +738 -0
  45. tetra_rp/core/resources/base.py +139 -4
  46. tetra_rp/core/resources/constants.py +21 -0
  47. tetra_rp/core/resources/cpu.py +115 -13
  48. tetra_rp/core/resources/gpu.py +182 -16
  49. tetra_rp/core/resources/live_serverless.py +153 -16
  50. tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
  51. tetra_rp/core/resources/network_volume.py +126 -31
  52. tetra_rp/core/resources/resource_manager.py +436 -35
  53. tetra_rp/core/resources/serverless.py +537 -120
  54. tetra_rp/core/resources/serverless_cpu.py +201 -0
  55. tetra_rp/core/resources/template.py +1 -59
  56. tetra_rp/core/utils/constants.py +10 -0
  57. tetra_rp/core/utils/file_lock.py +260 -0
  58. tetra_rp/core/utils/http.py +67 -0
  59. tetra_rp/core/utils/lru_cache.py +75 -0
  60. tetra_rp/core/utils/singleton.py +36 -1
  61. tetra_rp/core/validation.py +44 -0
  62. tetra_rp/execute_class.py +301 -0
  63. tetra_rp/protos/remote_execution.py +98 -9
  64. tetra_rp/runtime/__init__.py +1 -0
  65. tetra_rp/runtime/circuit_breaker.py +274 -0
  66. tetra_rp/runtime/config.py +12 -0
  67. tetra_rp/runtime/exceptions.py +49 -0
  68. tetra_rp/runtime/generic_handler.py +206 -0
  69. tetra_rp/runtime/lb_handler.py +189 -0
  70. tetra_rp/runtime/load_balancer.py +160 -0
  71. tetra_rp/runtime/manifest_fetcher.py +192 -0
  72. tetra_rp/runtime/metrics.py +325 -0
  73. tetra_rp/runtime/models.py +73 -0
  74. tetra_rp/runtime/mothership_provisioner.py +512 -0
  75. tetra_rp/runtime/production_wrapper.py +266 -0
  76. tetra_rp/runtime/reliability_config.py +149 -0
  77. tetra_rp/runtime/retry_manager.py +118 -0
  78. tetra_rp/runtime/serialization.py +124 -0
  79. tetra_rp/runtime/service_registry.py +346 -0
  80. tetra_rp/runtime/state_manager_client.py +248 -0
  81. tetra_rp/stubs/live_serverless.py +35 -17
  82. tetra_rp/stubs/load_balancer_sls.py +357 -0
  83. tetra_rp/stubs/registry.py +145 -19
  84. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
  85. tetra_rp-0.24.0.dist-info/RECORD +99 -0
  86. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
  87. tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
  88. tetra_rp/core/pool/cluster_manager.py +0 -177
  89. tetra_rp/core/pool/dataclass.py +0 -18
  90. tetra_rp/core/pool/ex.py +0 -38
  91. tetra_rp/core/pool/job.py +0 -22
  92. tetra_rp/core/pool/worker.py +0 -19
  93. tetra_rp/core/resources/utils.py +0 -50
  94. tetra_rp/core/utils/json.py +0 -33
  95. tetra_rp-0.6.0.dist-info/RECORD +0 -39
  96. /tetra_rp/{core/pool → cli}/__init__.py +0 -0
  97. {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
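A central change in this release is the rewritten resource manager, shown in the hunk that follows: persisted state moves from .tetra_resources.pkl to .runpod/resources.pkl, is keyed by "ResourceType:name", and is written as a (resources, config_hashes) tuple under cross-platform file locks. As a quick orientation aid, here is a small inspection sketch; the pickle layout is taken from that hunk, while the script itself is illustrative and not part of the package.

# Illustrative inspection sketch, not part of tetra-rp: the .runpod/resources.pkl path
# and the (resources, config_hashes) tuple layout come from the resource_manager.py
# hunk below. tetra-rp must be importable, since the pickle contains its resource objects.
from pathlib import Path

import cloudpickle

STATE_FILE = Path(".runpod") / "resources.pkl"


def main() -> None:
    if not STATE_FILE.exists():
        print(f"No state file at {STATE_FILE}")
        return

    with open(STATE_FILE, "rb") as f:
        data = cloudpickle.load(f)

    # 0.24.0 persists a (resources, config_hashes) tuple; older state is a bare dict.
    if isinstance(data, tuple) and len(data) == 2:
        resources, config_hashes = data
    else:
        resources, config_hashes = data, {}

    for key, resource in resources.items():
        # Keys are "ResourceType:name" after migration; legacy hash keys may remain.
        print(key, getattr(resource, "id", None), config_hashes.get(key, "")[:12])


if __name__ == "__main__":
    main()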
tetra_rp/core/resources/resource_manager.py
@@ -1,80 +1,481 @@
+ import asyncio
+ from contextlib import asynccontextmanager
  import cloudpickle
  import logging
- from typing import Dict
+ from typing import Any, Dict, List, Optional, Tuple
  from pathlib import Path

+ from ..exceptions import RunpodAPIKeyError
  from ..utils.singleton import SingletonMixin
+ from ..utils.file_lock import file_lock, FileLockError

  from .base import DeployableResource


  log = logging.getLogger(__name__)

- # File to persist state of resources
- RESOURCE_STATE_FILE = Path(".tetra_resources.pkl")
+ # Directory and file to persist state of resources
+ RUNPOD_FLASH_DIR = Path(".runpod")
+ RESOURCE_STATE_FILE = RUNPOD_FLASH_DIR / "resources.pkl"


  class ResourceManager(SingletonMixin):
      """Manages dynamic provisioning and tracking of remote resources."""

+     # Class variables shared across all instances (singleton)
      _resources: Dict[str, DeployableResource] = {}
+     _resource_configs: Dict[str, str] = {}  # Tracks config hashes for drift detection
+     _deployment_locks: Dict[str, asyncio.Lock] = {}
+     _global_lock: Optional[asyncio.Lock] = None
+     _lock_initialized = False
+     _resources_initialized = False

      def __init__(self):
-         if not self._resources:
+         # Ensure async locks are initialized properly for the singleton instance
+         if not ResourceManager._lock_initialized:
+             ResourceManager._global_lock = asyncio.Lock()
+             ResourceManager._lock_initialized = True
+
+         # Load resources immediately on initialization (only once)
+         if not ResourceManager._resources_initialized:
              self._load_resources()
+             self._migrate_to_name_based_keys()  # Auto-migrate legacy resources
+             self._refresh_config_hashes()  # Refresh config hashes after code changes
+             ResourceManager._resources_initialized = True

      def _load_resources(self) -> Dict[str, DeployableResource]:
-         """Load persisted resource information using cloudpickle."""
+         """Load persisted resource information using cross-platform file locking."""
          if RESOURCE_STATE_FILE.exists():
              try:
                  with open(RESOURCE_STATE_FILE, "rb") as f:
-                     self._resources = cloudpickle.load(f)
-                 log.debug(f"Loaded saved resources from {RESOURCE_STATE_FILE}")
-             except Exception as e:
+                     # Acquire shared lock for reading (cross-platform)
+                     with file_lock(f, exclusive=False):
+                         data = cloudpickle.load(f)
+
+                 # Handle both old (dict) and new (tuple) pickle formats
+                 if isinstance(data, tuple) and len(data) == 2:
+                     self._resources, self._resource_configs = data
+                 else:
+                     # Legacy format: just resources dict
+                     self._resources = data
+                     self._resource_configs = {}
+
+                 log.debug(
+                     f"Loaded {len(self._resources)} saved resources from {RESOURCE_STATE_FILE}:\n"
+                     f" Keys: {list(self._resources.keys())}"
+                 )
+             except (FileLockError, Exception) as e:
                  log.error(f"Failed to load resources from {RESOURCE_STATE_FILE}: {e}")
          return self._resources

+     def _migrate_to_name_based_keys(self) -> None:
+         """Migrate from hash-based keys to name-based keys.
+
+         Legacy format: {resource_id_hash: resource}
+         New format: {ResourceType:name: resource}
+
+         This enables config drift detection and updates.
+         """
+         migrated = {}
+         migrated_configs = {}
+
+         for key, resource in self._resources.items():
+             # Check if already using name-based key format
+             if ":" in key and not key.startswith(resource.__class__.__name__ + "_"):
+                 # Already migrated
+                 migrated[key] = resource
+                 migrated_configs[key] = self._resource_configs.get(
+                     key, resource.config_hash
+                 )
+                 continue
+
+             # Legacy hash-based key - migrate to name-based
+             if hasattr(resource, "get_resource_key"):
+                 new_key = resource.get_resource_key()
+                 migrated[new_key] = resource
+                 migrated_configs[new_key] = resource.config_hash
+                 log.debug(f"Migrated resource: {key} → {new_key}")
+             else:
+                 # Fallback: keep original key if no name available
+                 migrated[key] = resource
+                 migrated_configs[key] = self._resource_configs.get(key, "")
+
+         if len(migrated) != len(self._resources):
+             log.info(f"Migrated {len(self._resources)} resources to name-based keys")
+         self._resources = migrated
+         self._resource_configs = migrated_configs
+         self._save_resources()  # Persist migration
+
+     def _refresh_config_hashes(self) -> None:
+         """Refresh stored config hashes to match current code.
+
+         This is needed when code changes affect how config_hash is computed
+         (e.g., adding field_serializers, changing _input_only sets).
+
+         Compares stored hash with freshly computed hash. If they differ,
+         updates the stored hash to prevent false drift detection.
+         """
+         updated = False
+
+         for key, resource in self._resources.items():
+             if not hasattr(resource, "config_hash"):
+                 continue
+
+             # Compute fresh hash with current code
+             fresh_hash = resource.config_hash
+             stored_hash = self._resource_configs.get(key, "")
+
+             # If hashes differ, update stored hash
+             if stored_hash != fresh_hash:
+                 log.debug(
+                     f"Refreshing config hash for '{key}': "
+                     f"{stored_hash[:8]}... → {fresh_hash[:8]}..."
+                 )
+                 self._resource_configs[key] = fresh_hash
+                 updated = True
+
+         # Save if any hashes were updated
+         if updated:
+             log.info("Refreshed config hashes after code changes")
+             self._save_resources()
+
      def _save_resources(self) -> None:
-         """Persist state of resources to disk using cloudpickle."""
-         with open(RESOURCE_STATE_FILE, "wb") as f:
-             cloudpickle.dump(self._resources, f)
-         log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
+         """Persist state of resources to disk using cross-platform file locking."""
+         try:
+             # Ensure directory exists
+             RUNPOD_FLASH_DIR.mkdir(parents=True, exist_ok=True)
+
+             with open(RESOURCE_STATE_FILE, "wb") as f:
+                 # Acquire exclusive lock for writing (cross-platform)
+                 with file_lock(f, exclusive=True):
+                     # Save both resources and config hashes as tuple
+                     data = (self._resources, self._resource_configs)
+                     cloudpickle.dump(data, f)
+                     f.flush()  # Ensure data is written to disk
+             log.debug(f"Saved resources in {RESOURCE_STATE_FILE}")
+         except (FileLockError, Exception) as e:
+             log.error(f"Failed to save resources to {RESOURCE_STATE_FILE}: {e}")
+             raise

-     def add_resource(self, uid: str, resource: DeployableResource):
-         """Add a resource to the manager."""
+     def _add_resource(self, uid: str, resource: DeployableResource):
+         """Add a resource to the manager (protected method for internal use)."""
          self._resources[uid] = resource
+         self._resource_configs[uid] = resource.config_hash
          self._save_resources()

-     # function to check if resource still exists remotely, else remove it
-     def remove_resource(self, uid: str):
-         """Remove a resource from the manager."""
+     def _remove_resource(self, uid: str):
+         """Remove a resource from the manager (protected method for internal use)."""
          if uid not in self._resources:
              log.warning(f"Resource {uid} not found for removal")
              return

          del self._resources[uid]
+         self._resource_configs.pop(uid, None)  # Remove config hash too
          log.debug(f"Removed resource {uid}")

          self._save_resources()

+     async def register_resource(self, resource: DeployableResource) -> str:
+         """Persist a resource config into pickled state. Not thread safe."""
+         uid = resource.resource_id
+         self._add_resource(uid, resource)
+         return uid
+
+     async def _deploy_with_error_context(
+         self, config: DeployableResource
+     ) -> DeployableResource:
+         """Deploy resource with enhanced error context for RunpodAPIKeyError.
+
+         Args:
+             config: Resource configuration to deploy.
+
+         Returns:
+             Deployed resource instance.
+
+         Raises:
+             RunpodAPIKeyError: If deployment fails due to missing API key, with resource context.
+         """
+         try:
+             return await config._do_deploy()
+         except RunpodAPIKeyError as e:
+             error_msg = f"Cannot deploy resource '{config.name}': {str(e)}"
+             raise RunpodAPIKeyError(error_msg) from e
+
+     async def get_resource_from_store(self, uid: str):
+         return self._resources.get(uid)
+
      async def get_or_deploy_resource(
          self, config: DeployableResource
      ) -> DeployableResource:
-         """Get existing or create new resource based on config."""
-         uid = config.resource_id
-         if existing := self._resources.get(uid):
-             if not existing.is_deployed():
-                 log.warning(f"{existing} is no longer valid, redeploying.")
-                 self.remove_resource(uid)
-                 return await self.get_or_deploy_resource(config)
-
-             log.debug(f"{existing} exists, reusing.")
-             log.info(f"URL: {existing.url}")
-             return existing
-
-         if deployed_resource := await config.deploy():
-             log.info(f"URL: {deployed_resource.url}")
-             self.add_resource(uid, deployed_resource)
-             return deployed_resource
-
-         raise RuntimeError(f"Deployment failed for resource {uid}")
+         """Get existing, update if config changed, or deploy new resource.
+
+         Uses name-based identity (ResourceType:name) instead of config hash.
+         This enables automatic config drift detection and updates.
+
+         Flow:
+         1. Check if resource with same name exists
+         2. If exists, compare config hashes
+         3. If config changed, automatically update the endpoint
+         4. If no resource exists, deploy new one
+
+         Thread-safe implementation that prevents concurrent deployments.
+         """
+         # Use name-based key instead of hash
+         resource_key = config.get_resource_key()
+         new_config_hash = config.config_hash
+
+         log.debug(
+             f"get_or_deploy_resource called:\n"
+             f" Config type: {type(config).__name__}\n"
+             f" Config name: {getattr(config, 'name', 'N/A')}\n"
+             f" Resource key: {resource_key}\n"
+             f" New config hash: {new_config_hash[:16]}...\n"
+             f" Available keys in cache: {list(self._resources.keys())}"
+         )
+
+         # Ensure global lock is initialized
+         assert ResourceManager._global_lock is not None, "Global lock not initialized"
+
+         # Get or create a per-resource lock (use name-based key)
+         async with ResourceManager._global_lock:
+             if resource_key not in ResourceManager._deployment_locks:
+                 ResourceManager._deployment_locks[resource_key] = asyncio.Lock()
+             resource_lock = ResourceManager._deployment_locks[resource_key]
+
+         # Acquire per-resource lock
+         async with resource_lock:
+             existing = self._resources.get(resource_key)
+
+             if existing:
+                 log.debug(f"Resource found in cache: {resource_key}")
+                 # Resource exists - check if still valid
+                 if not existing.is_deployed():
+                     log.warning(f"{existing} is no longer valid, redeploying.")
+                     self._remove_resource(resource_key)
+                     try:
+                         deployed_resource = await self._deploy_with_error_context(
+                             config
+                         )
+                         log.info(f"URL: {deployed_resource.url}")
+                         self._add_resource(resource_key, deployed_resource)
+                         return deployed_resource
+                     except Exception:
+                         # Universal rule: If resource was created (has ID), track it for cleanup
+                         if hasattr(config, "id") and config.id:
+                             log.warning(
+                                 f"Deployment failed but resource '{config.name}' was created with ID {config.id}, "
+                                 f"caching for cleanup"
+                             )
+                             self._add_resource(resource_key, config)
+                         raise
+
+                 # Check for config drift
+                 stored_config_hash = self._resource_configs.get(resource_key, "")
+
+                 if stored_config_hash != new_config_hash:
+                     # Detailed drift debugging
+                     log.debug(
+                         f"DRIFT DEBUG for '{config.name}':\n"
+                         f" Stored hash: {stored_config_hash}\n"
+                         f" New hash: {new_config_hash}\n"
+                         f" Stored resource type: {type(existing).__name__}\n"
+                         f" New resource type: {type(config).__name__}\n"
+                         f" Existing config fields: {existing.model_dump(exclude_none=True, exclude={'id'}) if hasattr(existing, 'model_dump') else 'N/A'}\n"
+                         f" New config fields: {config.model_dump(exclude_none=True, exclude={'id'}) if hasattr(config, 'model_dump') else 'N/A'}"
+                     )
+                     log.info(
+                         f"Config drift detected for '{config.name}': "
+                         f"Automatically updating endpoint"
+                     )
+
+                     # Attempt update (will redeploy if structural changes detected)
+                     if hasattr(existing, "update"):
+                         updated_resource = await existing.update(config)
+                         self._add_resource(resource_key, updated_resource)
+                         return updated_resource
+                     else:
+                         # Fallback: redeploy if update not supported
+                         log.warning(
+                             f"{config.name}: Resource type doesn't support updates, "
+                             "redeploying"
+                         )
+                         await existing.undeploy()
+                         try:
+                             deployed_resource = await self._deploy_with_error_context(
+                                 config
+                             )
+                             log.info(f"URL: {deployed_resource.url}")
+                             self._add_resource(resource_key, deployed_resource)
+                             return deployed_resource
+                         except Exception:
+                             # Universal rule: If resource was created (has ID), track it for cleanup
+                             if hasattr(config, "id") and config.id:
+                                 log.warning(
+                                     f"Deployment failed but resource '{config.name}' was created with ID {config.id}, "
+                                     f"caching for cleanup"
+                                 )
+                                 self._add_resource(resource_key, config)
+                             raise
+
+                 # Config unchanged, reuse existing
+                 log.debug(f"{existing} exists, reusing (config unchanged)")
+                 log.info(f"URL: {existing.url}")
+                 return existing
+
+             # No existing resource, deploy new one
+             log.debug(
+                 f"Resource NOT found in cache, deploying new: {resource_key}\n"
+                 f" Searched in keys: {list(self._resources.keys())}"
+             )
+             try:
+                 deployed_resource = await self._deploy_with_error_context(config)
+                 log.info(f"URL: {deployed_resource.url}")
+                 self._add_resource(resource_key, deployed_resource)
+                 return deployed_resource
+             except Exception:
+                 # Universal rule: If resource was created (has ID), track it for cleanup
+                 if hasattr(config, "id") and config.id:
+                     log.warning(
+                         f"Deployment failed but resource '{config.name}' was created with ID {config.id}, "
+                         f"caching for cleanup"
+                     )
+                     self._add_resource(resource_key, config)
+                 raise
+
+     @asynccontextmanager
+     async def resource_lock(self, uid: str):
+         # Ensure global lock is initialized (should be done in __init__)
+         assert ResourceManager._global_lock is not None, "Global lock not initialized"
+
+         # Get or create a per-resource lock
+         async with ResourceManager._global_lock:
+             if uid not in ResourceManager._deployment_locks:
+                 ResourceManager._deployment_locks[uid] = asyncio.Lock()
+             resource_lock = ResourceManager._deployment_locks[uid]
+
+         async with resource_lock:
+             yield
+
+     def list_all_resources(self) -> Dict[str, DeployableResource]:
+         """List all tracked resources.
+
+         Returns:
+             Dictionary of resource_id -> DeployableResource
+         """
+         return self._resources.copy()
+
+     def find_resources_by_name(self, name: str) -> List[Tuple[str, DeployableResource]]:
+         """Find resources matching the given name.
+
+         Args:
+             name: The name to search for (exact match)
+
+         Returns:
+             List of (resource_id, resource) tuples matching the name
+         """
+         matches = []
+         for uid, resource in self._resources.items():
+             if hasattr(resource, "name") and resource.name == name:
+                 matches.append((uid, resource))
+         return matches
+
+     def find_resources_by_provider_id(
+         self, provider_id: str
+     ) -> List[Tuple[str, DeployableResource]]:
+         """Find resources matching the provider-assigned ID.
+
+         Args:
+             provider_id: The provider resource ID to search for (exact match)
+
+         Returns:
+             List of (resource_id, resource) tuples matching the provider ID
+         """
+         matches = []
+         for uid, resource in self._resources.items():
+             if getattr(resource, "id", None) == provider_id:
+                 matches.append((uid, resource))
+         return matches
+
+     async def undeploy_resource(
+         self,
+         resource_id: str,
+         resource_name: Optional[str] = None,
+         force_remove: bool = False,
+     ) -> Dict[str, Any]:
+         """Undeploy a resource and remove from tracking.
+
+         This is the public interface for removing resources. It calls the resource's
+         _do_undeploy() method (polymorphic) and removes from tracking on success.
+
+         Args:
+             resource_id: The resource ID to undeploy
+             resource_name: Optional human-readable name for error messages
+             force_remove: If True, remove from tracking even if undeploy fails.
+                 Use this for cleanup scenarios where resource is already deleted remotely.
+
+         Returns:
+             Dict with keys:
+             - success: bool indicating if undeploy succeeded
+             - name: resource name (if available)
+             - endpoint_id: resource endpoint ID (if available)
+             - message: status message
+         """
+         resource = self._resources.get(resource_id)
+         log.debug(f"existing resource IDs: {list(self._resources.keys())}")
+
+         if not resource:
+             return {
+                 "success": False,
+                 "name": resource_name or "Unknown",
+                 "endpoint_id": "N/A",
+                 "message": f"Resource {resource_id} not found in tracking",
+             }
+
+         # Get resource metadata for response
+         name = resource_name or getattr(resource, "name", "Unknown")
+         endpoint_id = getattr(resource, "id", "N/A")
+
+         try:
+             # Call polymorphic undeploy method
+             success = await resource._do_undeploy()
+
+             if success:
+                 # Remove from tracking on successful undeploy
+                 self._remove_resource(resource_id)
+                 return {
+                     "success": True,
+                     "name": name,
+                     "endpoint_id": endpoint_id,
+                     "message": f"Successfully undeployed '{name}' ({endpoint_id})",
+                 }
+             else:
+                 # Force remove if requested (e.g., cleanup of already-deleted resources)
+                 if force_remove:
+                     self._remove_resource(resource_id)
+                 return {
+                     "success": False,
+                     "name": name,
+                     "endpoint_id": endpoint_id,
+                     "message": f"Failed to undeploy '{name}' ({endpoint_id})",
+                 }
+
+         except NotImplementedError as e:
+             # Resource type doesn't support undeploy yet
+             if force_remove:
+                 self._remove_resource(resource_id)
+             return {
+                 "success": False,
+                 "name": name,
+                 "endpoint_id": endpoint_id,
+                 "message": f"Cannot undeploy '{name}': {str(e)}",
+             }
+         except Exception as e:
+             # Unexpected error during undeploy (e.g., already deleted remotely)
+             if force_remove:
+                 self._remove_resource(resource_id)
+             return {
+                 "success": False,
+                 "name": name,
+                 "endpoint_id": endpoint_id,
+                 "message": f"Error undeploying '{name}': {str(e)}",
+             }
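For orientation, the sketch below exercises the ResourceManager surface added in this hunk (get_or_deploy_resource, find_resources_by_name, undeploy_resource). The import paths mirror the file layout listed above, and the concrete DeployableResource config supplied by the caller is assumed rather than shown, so treat this as an illustrative sketch rather than documented usage.

# Illustrative sketch, not documented usage: assumes tetra-rp 0.24.0 is installed and
# that `config` is a concrete DeployableResource (e.g. one of the serverless configs
# listed in the files above). Import paths follow the file layout shown in this diff.
from tetra_rp.core.resources.base import DeployableResource
from tetra_rp.core.resources.resource_manager import ResourceManager


async def deploy_and_teardown(config: DeployableResource) -> None:
    manager = ResourceManager()  # singleton; loads .runpod/resources.pkl on first init

    # Reuses a tracked endpoint with the same "ResourceType:name" key, updates it in
    # place when the stored config hash differs (drift), or deploys a new one.
    resource = await manager.get_or_deploy_resource(config)
    print(f"Deployed: {resource.url}")

    # Tracked state can be queried by name or by provider-assigned ID.
    matches = manager.find_resources_by_name(getattr(config, "name", ""))

    # Public teardown path: calls the resource's _do_undeploy() and drops the entry
    # from .runpod/resources.pkl; force_remove=True also clears entries whose remote
    # endpoint is already gone.
    for resource_key, _tracked in matches:
        result = await manager.undeploy_resource(resource_key, force_remove=True)
        print(result["message"])


# To run:  asyncio.run(deploy_and_teardown(my_config))  where my_config is a placeholder
# name for a real DeployableResource instance.

Under the new scheme, repeated runs with an unchanged config reuse the tracked endpoint instead of redeploying; only a changed config hash triggers the update or redeploy paths logged in the hunk above.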