tetra-rp 0.6.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tetra_rp/__init__.py +109 -19
- tetra_rp/cli/commands/__init__.py +1 -0
- tetra_rp/cli/commands/apps.py +143 -0
- tetra_rp/cli/commands/build.py +1082 -0
- tetra_rp/cli/commands/build_utils/__init__.py +1 -0
- tetra_rp/cli/commands/build_utils/handler_generator.py +176 -0
- tetra_rp/cli/commands/build_utils/lb_handler_generator.py +309 -0
- tetra_rp/cli/commands/build_utils/manifest.py +430 -0
- tetra_rp/cli/commands/build_utils/mothership_handler_generator.py +75 -0
- tetra_rp/cli/commands/build_utils/scanner.py +596 -0
- tetra_rp/cli/commands/deploy.py +580 -0
- tetra_rp/cli/commands/init.py +123 -0
- tetra_rp/cli/commands/resource.py +108 -0
- tetra_rp/cli/commands/run.py +296 -0
- tetra_rp/cli/commands/test_mothership.py +458 -0
- tetra_rp/cli/commands/undeploy.py +533 -0
- tetra_rp/cli/main.py +97 -0
- tetra_rp/cli/utils/__init__.py +1 -0
- tetra_rp/cli/utils/app.py +15 -0
- tetra_rp/cli/utils/conda.py +127 -0
- tetra_rp/cli/utils/deployment.py +530 -0
- tetra_rp/cli/utils/ignore.py +143 -0
- tetra_rp/cli/utils/skeleton.py +184 -0
- tetra_rp/cli/utils/skeleton_template/.env.example +4 -0
- tetra_rp/cli/utils/skeleton_template/.flashignore +40 -0
- tetra_rp/cli/utils/skeleton_template/.gitignore +44 -0
- tetra_rp/cli/utils/skeleton_template/README.md +263 -0
- tetra_rp/cli/utils/skeleton_template/main.py +44 -0
- tetra_rp/cli/utils/skeleton_template/mothership.py +55 -0
- tetra_rp/cli/utils/skeleton_template/pyproject.toml +58 -0
- tetra_rp/cli/utils/skeleton_template/requirements.txt +1 -0
- tetra_rp/cli/utils/skeleton_template/workers/__init__.py +0 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/cpu/endpoint.py +36 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/__init__.py +19 -0
- tetra_rp/cli/utils/skeleton_template/workers/gpu/endpoint.py +61 -0
- tetra_rp/client.py +136 -33
- tetra_rp/config.py +29 -0
- tetra_rp/core/api/runpod.py +591 -39
- tetra_rp/core/deployment.py +232 -0
- tetra_rp/core/discovery.py +425 -0
- tetra_rp/core/exceptions.py +50 -0
- tetra_rp/core/resources/__init__.py +27 -9
- tetra_rp/core/resources/app.py +738 -0
- tetra_rp/core/resources/base.py +139 -4
- tetra_rp/core/resources/constants.py +21 -0
- tetra_rp/core/resources/cpu.py +115 -13
- tetra_rp/core/resources/gpu.py +182 -16
- tetra_rp/core/resources/live_serverless.py +153 -16
- tetra_rp/core/resources/load_balancer_sls_resource.py +440 -0
- tetra_rp/core/resources/network_volume.py +126 -31
- tetra_rp/core/resources/resource_manager.py +436 -35
- tetra_rp/core/resources/serverless.py +537 -120
- tetra_rp/core/resources/serverless_cpu.py +201 -0
- tetra_rp/core/resources/template.py +1 -59
- tetra_rp/core/utils/constants.py +10 -0
- tetra_rp/core/utils/file_lock.py +260 -0
- tetra_rp/core/utils/http.py +67 -0
- tetra_rp/core/utils/lru_cache.py +75 -0
- tetra_rp/core/utils/singleton.py +36 -1
- tetra_rp/core/validation.py +44 -0
- tetra_rp/execute_class.py +301 -0
- tetra_rp/protos/remote_execution.py +98 -9
- tetra_rp/runtime/__init__.py +1 -0
- tetra_rp/runtime/circuit_breaker.py +274 -0
- tetra_rp/runtime/config.py +12 -0
- tetra_rp/runtime/exceptions.py +49 -0
- tetra_rp/runtime/generic_handler.py +206 -0
- tetra_rp/runtime/lb_handler.py +189 -0
- tetra_rp/runtime/load_balancer.py +160 -0
- tetra_rp/runtime/manifest_fetcher.py +192 -0
- tetra_rp/runtime/metrics.py +325 -0
- tetra_rp/runtime/models.py +73 -0
- tetra_rp/runtime/mothership_provisioner.py +512 -0
- tetra_rp/runtime/production_wrapper.py +266 -0
- tetra_rp/runtime/reliability_config.py +149 -0
- tetra_rp/runtime/retry_manager.py +118 -0
- tetra_rp/runtime/serialization.py +124 -0
- tetra_rp/runtime/service_registry.py +346 -0
- tetra_rp/runtime/state_manager_client.py +248 -0
- tetra_rp/stubs/live_serverless.py +35 -17
- tetra_rp/stubs/load_balancer_sls.py +357 -0
- tetra_rp/stubs/registry.py +145 -19
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/METADATA +398 -60
- tetra_rp-0.24.0.dist-info/RECORD +99 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/WHEEL +1 -1
- tetra_rp-0.24.0.dist-info/entry_points.txt +2 -0
- tetra_rp/core/pool/cluster_manager.py +0 -177
- tetra_rp/core/pool/dataclass.py +0 -18
- tetra_rp/core/pool/ex.py +0 -38
- tetra_rp/core/pool/job.py +0 -22
- tetra_rp/core/pool/worker.py +0 -19
- tetra_rp/core/resources/utils.py +0 -50
- tetra_rp/core/utils/json.py +0 -33
- tetra_rp-0.6.0.dist-info/RECORD +0 -39
- /tetra_rp/{core/pool → cli}/__init__.py +0 -0
- {tetra_rp-0.6.0.dist-info → tetra_rp-0.24.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,512 @@
|
|
|
1
|
+
"""Mothership auto-provisioning logic with manifest reconciliation."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from tetra_rp.core.resources.base import DeployableResource
|
|
12
|
+
from tetra_rp.core.resources.constants import ENDPOINT_DOMAIN
|
|
13
|
+
from tetra_rp.core.resources.resource_manager import ResourceManager
|
|
14
|
+
|
|
15
|
+
from .state_manager_client import StateManagerClient
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class ManifestDiff:
    """Outcome of comparing a local manifest against the persisted one.

    Every field holds resource names, grouped by the action the
    reconciliation step should take for them.
    """

    # Present locally but missing from the persisted manifest -> deploy.
    new: List[str]
    # Present in both, but the configuration hash differs -> update.
    changed: List[str]
    # Present only in the persisted manifest -> delete.
    removed: List[str]
    # Present in both with a matching configuration hash -> skip.
    unchanged: List[str]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_mothership_url() -> str:
    """Build the mothership URL from the ``RUNPOD_ENDPOINT_ID`` env var.

    Returns:
        URL of the form ``https://{endpoint_id}.{ENDPOINT_DOMAIN}``.

    Raises:
        RuntimeError: If ``RUNPOD_ENDPOINT_ID`` is unset or empty.
    """
    endpoint_id = os.environ.get("RUNPOD_ENDPOINT_ID")
    if endpoint_id:
        return f"https://{endpoint_id}.{ENDPOINT_DOMAIN}"
    raise RuntimeError("RUNPOD_ENDPOINT_ID environment variable not set")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def is_mothership() -> bool:
    """Report whether this endpoint is flagged as the mothership.

    Returns:
        True when the ``FLASH_IS_MOTHERSHIP`` environment variable equals
        ``"true"`` (compared case-insensitively); False otherwise,
        including when the variable is unset.
    """
    flag = os.environ.get("FLASH_IS_MOTHERSHIP", "")
    return flag.lower() == "true"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_manifest(manifest_path: Optional[Path] = None) -> Dict[str, Any]:
    """Load ``flash_manifest.json`` from the first candidate that parses.

    Candidates are tried in this order:

    1. ``manifest_path``, when provided.
    2. The path named by the ``FLASH_MANIFEST_PATH`` environment variable.
    3. ``flash_manifest.json`` two directories above the directory that
       contains this module (``Path(__file__).parent.parent.parent``).
    4. ``flash_manifest.json`` in the current working directory.

    A candidate that does not exist is skipped silently. A candidate that
    exists but cannot be read or parsed is logged as a warning and the next
    candidate is tried.

    Args:
        manifest_path: Explicit path to the manifest. A plain string is
            accepted as well and converted to ``Path``.

    Returns:
        Manifest dictionary

    Raises:
        FileNotFoundError: If no candidate could be loaded.
    """
    paths_to_try = []

    # Explicit path (coerced so a str argument does not fail on .exists())
    if manifest_path:
        paths_to_try.append(Path(manifest_path))

    # Environment variable
    env_path = os.getenv("FLASH_MANIFEST_PATH")
    if env_path:
        paths_to_try.append(Path(env_path))

    # Auto-detection: two levels above this module's directory, or cwd
    paths_to_try.extend(
        [
            Path(__file__).parent.parent.parent / "flash_manifest.json",
            Path.cwd() / "flash_manifest.json",
        ]
    )

    for path in paths_to_try:
        if not path.exists():
            continue
        try:
            # JSON is UTF-8 by specification; do not depend on the locale's
            # default encoding, which differs between platforms.
            with open(path, encoding="utf-8") as f:
                manifest_dict = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: unreadable file or directory. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Failed to load manifest from {path}: {e}")
            continue
        logger.debug(f"Manifest loaded from {path}")
        return manifest_dict

    raise FileNotFoundError(
        f"flash_manifest.json not found. Searched paths: {paths_to_try}"
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def compute_resource_hash(resource_data: Dict[str, Any]) -> str:
    """Fingerprint a resource configuration for drift detection.

    The configuration is serialized to JSON with sorted keys, so the result
    does not depend on dictionary insertion order.

    Args:
        resource_data: Resource configuration from manifest

    Returns:
        Hex-encoded SHA-256 digest of the serialized configuration.
    """
    canonical = json.dumps(resource_data, sort_keys=True).encode()
    digest = hashlib.sha256()
    digest.update(canonical)
    return digest.hexdigest()
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def filter_resources_by_manifest(
    all_resources: Dict[str, DeployableResource],
    manifest: Dict[str, Any],
) -> Dict[str, DeployableResource]:
    """Keep only the cached resources that the manifest still declares.

    A cached entry is dropped when any of the following holds:

    * it has no ``name`` attribute, or the name is falsy;
    * its name is not a key of ``manifest["resources"]``;
    * the manifest entry has a ``resource_type`` that differs from the
      cached object's class name.

    Args:
        all_resources: All resources from ResourceManager cache
        manifest: Current deployment manifest

    Returns:
        New dict, keyed like ``all_resources``, holding the surviving entries.
    """
    declared = manifest.get("resources", {})
    kept = {}
    dropped = 0

    for cache_key, cached in all_resources.items():
        name = getattr(cached, "name", None)

        if not name:
            logger.warning(f"Skipping cached resource without name: {cache_key}")
            dropped += 1
        elif name not in declared:
            logger.info(
                f"Removing stale cached resource '{name}' "
                f"(not in current manifest)"
            )
            dropped += 1
        else:
            expected_type = declared[name].get("resource_type")
            actual_type = cached.__class__.__name__

            # A manifest entry without resource_type matches any cached class.
            if expected_type and expected_type != actual_type:
                logger.warning(
                    f"Removing stale cached resource '{name}' "
                    f"(type mismatch: cached={actual_type}, manifest={expected_type})"
                )
                dropped += 1
            else:
                kept[cache_key] = cached

    if dropped > 0:
        logger.info(
            f"Cache validation: Removed {dropped} stale "
            f"resource(s) not matching manifest"
        )

    return kept
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def reconcile_manifests(
    local_manifest: Dict[str, Any],
    persisted_manifest: Optional[Dict[str, Any]],
) -> ManifestDiff:
    """Classify resources by comparing the local and persisted manifests.

    Args:
        local_manifest: Current manifest from flash_manifest.json
        persisted_manifest: Last known manifest from State Manager; a falsy
            value (e.g. None on first boot) is treated as having no resources.

    Returns:
        ManifestDiff whose lists follow the iteration order of the manifests'
        ``resources`` mappings.
    """
    desired = local_manifest.get("resources", {})
    known = persisted_manifest.get("resources", {}) if persisted_manifest else {}

    to_deploy: List[str] = []
    to_update: List[str] = []
    to_skip: List[str] = []

    for resource_name, spec in desired.items():
        if resource_name not in known:
            to_deploy.append(resource_name)
            continue

        # The hash of the current config is compared with the hash stored
        # under "config_hash" in the persisted entry.
        current_hash = compute_resource_hash(spec)
        stored_hash = known[resource_name].get("config_hash")
        bucket = to_skip if current_hash == stored_hash else to_update
        bucket.append(resource_name)

    # Anything persisted that the local manifest no longer declares.
    to_delete = [name for name in known if name not in desired]

    return ManifestDiff(
        new=to_deploy, changed=to_update, removed=to_delete, unchanged=to_skip
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def create_resource_from_manifest(
    resource_name: str,
    resource_data: Dict[str, Any],
    mothership_url: str = "",
    flash_environment_id: Optional[str] = None,
) -> DeployableResource:
    """Instantiate the resource class named by a manifest entry.

    Args:
        resource_name: Name of the resource
        resource_data: Resource configuration from manifest
        mothership_url: Optional mothership URL. Not read by this function at
            present; kept in the signature for future use.
        flash_environment_id: Optional flash environment ID, passed to the
            resource as ``flashEnvironmentId`` when truthy.

    Returns:
        Configured DeployableResource ready for deployment

    Raises:
        ValueError: If resource type not supported
    """
    from tetra_rp.core.resources.live_serverless import (
        CpuLiveLoadBalancer,
        CpuLiveServerless,
        LiveLoadBalancer,
        LiveServerless,
    )
    from tetra_rp.core.resources.load_balancer_sls_resource import (
        LoadBalancerSlsResource,
    )
    from tetra_rp.core.resources.serverless import ServerlessResource

    # One table drives both the validation and the construction below.
    constructors = {
        "ServerlessResource": ServerlessResource,
        "LiveServerless": LiveServerless,
        "CpuLiveServerless": CpuLiveServerless,
        "LoadBalancerSlsResource": LoadBalancerSlsResource,
        "LiveLoadBalancer": LiveLoadBalancer,
        "CpuLiveLoadBalancer": CpuLiveLoadBalancer,
    }

    resource_type = resource_data.get("resource_type", "ServerlessResource")
    if resource_type not in list(constructors):
        raise ValueError(
            f"Unsupported resource type for auto-provisioning: {resource_type}"
        )

    env = {"FLASH_RESOURCE_NAME": resource_name}

    # FLASH_MOTHERSHIP_ID is added only when RUNPOD_ENDPOINT_ID is set. The
    # key is left out entirely otherwise, never set to None (a None value
    # reportedly fails Pydantic validation, while a missing key does not).
    mothership_id = os.getenv("RUNPOD_ENDPOINT_ID")
    if mothership_id:
        env["FLASH_MOTHERSHIP_ID"] = mothership_id

    # Extra variables for an entry that is itself flagged as the mothership.
    if resource_data.get("is_mothership"):
        env["FLASH_IS_MOTHERSHIP"] = "true"
        for manifest_key, env_key in (
            ("main_file", "FLASH_MAIN_FILE"),
            ("app_variable", "FLASH_APP_VARIABLE"),
        ):
            if manifest_key in resource_data:
                env[env_key] = resource_data[manifest_key]

    # With FLASH_IS_TEST_MOTHERSHIP=true the resource gets a "tmp-" prefix,
    # unless its name already carries one.
    test_mode = os.getenv("FLASH_IS_TEST_MOTHERSHIP", "").lower() == "true"
    deployed_name = resource_name
    if test_mode and not resource_name.startswith("tmp-"):
        deployed_name = f"tmp-{resource_name}"
        logger.info(f"Test mode: Using temporary name '{deployed_name}'")

    deployment_kwargs = {"name": deployed_name, "env": env}

    if flash_environment_id:
        deployment_kwargs["flashEnvironmentId"] = flash_environment_id

    # imageName takes precedence; templateId is used only when it is absent.
    if "imageName" in resource_data:
        deployment_kwargs["imageName"] = resource_data["imageName"]
    elif "templateId" in resource_data:
        deployment_kwargs["templateId"] = resource_data["templateId"]

    # Optional GPU/worker settings are forwarded verbatim when present.
    for optional_key in ("gpuIds", "workersMin", "workersMax"):
        if optional_key in resource_data:
            deployment_kwargs[optional_key] = resource_data[optional_key]

    # No template is passed to the constructor; creating one is left to the
    # resource class.
    return constructors[resource_type](**deployment_kwargs)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _resource_endpoint_url(resource: Any) -> Any:
    """Return ``resource.endpoint_url`` if that attribute exists, else ``resource.url``."""
    if hasattr(resource, "endpoint_url"):
        return resource.endpoint_url
    return resource.url


async def _provision_resource(
    manager: ResourceManager,
    state_client: StateManagerClient,
    mothership_id: str,
    manifest_resources: Dict[str, Any],
    resource_name: str,
    mothership_url: str,
    status: str,
    success_prefix: str,
    action: str,
) -> bool:
    """Deploy (or redeploy) one manifest resource and record the outcome.

    On success the State Manager entry receives the config hash, endpoint URL
    and ``status``. On failure it receives ``status="failed"`` plus the error
    text; a failure to record that is logged and otherwise ignored.

    Returns:
        True if the resource was deployed and its state recorded, else False.
    """
    try:
        resource_data = manifest_resources[resource_name]
        config = create_resource_from_manifest(
            resource_name, resource_data, mothership_url
        )
        deployed = await manager.get_or_deploy_resource(config)

        await state_client.update_resource_state(
            mothership_id,
            resource_name,
            {
                "config_hash": compute_resource_hash(resource_data),
                "endpoint_url": _resource_endpoint_url(deployed),
                "status": status,
            },
        )
        logger.info(f"{success_prefix}: {resource_name}")
        return True

    except Exception as e:
        logger.error(f"Failed to {action} {resource_name}: {e}")
        try:
            await state_client.update_resource_state(
                mothership_id,
                resource_name,
                {"status": "failed", "error": str(e)},
            )
        except Exception as sm_error:
            logger.error(
                f"Failed to update State Manager for {resource_name}: {sm_error}"
            )
        return False


async def _remove_resource(
    manager: ResourceManager,
    state_client: StateManagerClient,
    mothership_id: str,
    resource_name: str,
) -> bool:
    """Undeploy a resource that is no longer in the manifest.

    Only the first match returned by ``find_resources_by_name`` is undeployed.
    A resource the ResourceManager does not know is logged as a warning and
    not counted as a failure.

    Returns:
        False if the undeploy call reported failure or raised, else True.
    """
    try:
        matches = manager.find_resources_by_name(resource_name)
        if not matches:
            logger.warning(
                f"Removed resource {resource_name} not found in ResourceManager"
            )
            return True

        resource_id, _ = matches[0]
        result = await manager.undeploy_resource(resource_id, resource_name)
        if not result["success"]:
            logger.error(f"Failed to delete {resource_name}: {result['message']}")
            return False

        # The endpoint is already gone here, so a State Manager error is
        # logged without turning the deletion into a failure.
        try:
            await state_client.remove_resource_state(mothership_id, resource_name)
        except Exception as sm_error:
            logger.error(
                f"Failed to remove {resource_name} from State Manager: {sm_error}"
            )
        logger.info(f"Deleted removed resource: {resource_name}")
        return True

    except Exception as e:
        logger.error(f"Failed to delete {resource_name}: {e}")
        return False


async def reconcile_children(
    manifest_path: Path,
    mothership_url: str,
    state_client: StateManagerClient,
) -> None:
    """Reconcile all child resources based on manifest differences.

    Loads the local manifest, fetches the persisted manifest from the State
    Manager, then deploys new resources, redeploys changed ones and undeploys
    removed ones. A failure on one resource is logged and does not stop the
    others; the closing summary lists the resources whose operation failed.
    Any other exception is logged and swallowed, so this coroutine never
    raises.

    Args:
        manifest_path: Path to flash_manifest.json
        mothership_url: Mothership endpoint URL, forwarded to
            ``create_resource_from_manifest``
        state_client: State Manager API client
    """
    try:
        local_manifest = load_manifest(manifest_path)

        mothership_id = os.getenv("RUNPOD_ENDPOINT_ID")
        if not mothership_id:
            logger.error("RUNPOD_ENDPOINT_ID not set, cannot load persisted manifest")
            return

        persisted_manifest = await state_client.get_persisted_manifest(mothership_id)
        manifest_resources = local_manifest.get("resources", {})

        logger.info(
            f"Starting reconciliation: {len(manifest_resources)} manifest resources"
        )

        diff = reconcile_manifests(local_manifest, persisted_manifest)

        logger.info(
            f"Reconciliation plan: {len(diff.new)} to deploy, "
            f"{len(diff.changed)} to update, "
            f"{len(diff.removed)} to remove, "
            f"{len(diff.unchanged)} unchanged"
        )

        manager = ResourceManager()

        # NOTE(review): the filtered cache is only reported in the log line
        # below; nothing later in this function consumes it.
        all_cached = manager.list_all_resources()
        if all_cached:
            valid_cached = filter_resources_by_manifest(all_cached, local_manifest)
            logger.info(
                f"Cache validation: {len(all_cached)} cached, "
                f"{len(valid_cached)} valid, "
                f"{len(manifest_resources)} in manifest"
            )

        failed: List[str] = []

        # Deploy NEW resources
        for resource_name in diff.new:
            succeeded = await _provision_resource(
                manager,
                state_client,
                mothership_id,
                manifest_resources,
                resource_name,
                mothership_url,
                status="deployed",
                success_prefix="Deployed new resource",
                action="deploy",
            )
            if not succeeded:
                failed.append(resource_name)

        # Update CHANGED resources
        for resource_name in diff.changed:
            succeeded = await _provision_resource(
                manager,
                state_client,
                mothership_id,
                manifest_resources,
                resource_name,
                mothership_url,
                status="updated",
                success_prefix="Updated resource",
                action="update",
            )
            if not succeeded:
                failed.append(resource_name)

        # Delete REMOVED resources
        for resource_name in diff.removed:
            succeeded = await _remove_resource(
                manager, state_client, mothership_id, resource_name
            )
            if not succeeded:
                failed.append(resource_name)

        logger.info("=" * 60)
        if failed:
            # Do not claim full success when some operations failed.
            logger.warning(
                f"Provisioning complete - {len(failed)} resource operation(s) "
                f"failed: {', '.join(failed)}"
            )
        else:
            logger.info("Provisioning complete - All child endpoints deployed")
        logger.info(f"Total endpoints: {len(manifest_resources)}")
        logger.info("Test phase: Manifest updated with child endpoint URLs")
        logger.info("=" * 60)

    except Exception as e:
        logger.error(f"Provisioning failed: {e}", exc_info=True)
|