ob-metaflow-extensions 1.1.151__py2.py3-none-any.whl → 1.6.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +24 -3
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +16 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +333 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +1029 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +1300 -0
- metaflow_extensions/outerbounds/plugins/apps/core/exceptions.py +341 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +123 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +119 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +32 -8
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +1 -1
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +49 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +37 -7
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +46 -9
- metaflow_extensions/outerbounds/toplevel/apps/__init__.py +9 -0
- metaflow_extensions/outerbounds/toplevel/apps/exceptions.py +11 -0
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +86 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.6.2.dist-info/RECORD +136 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.151.dist-info/RECORD +0 -74
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/top_level.txt +0 -0
metaflow_extensions/outerbounds/plugins/apps/core/capsule.py (new file)
@@ -0,0 +1,1029 @@
+from datetime import datetime
+import json
+import os
+import pathlib
+import requests
+import sys
+import time
+from functools import partial
+import shlex
+from typing import Optional, List, Dict, Any, Tuple, Union, Callable
+from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
+from .app_config import AppConfig, CAPSULE_DEBUG, AuthType
+from . import experimental
+from ._state_machine import (
+    _capsule_worker_semantic_status,
+    _capsule_worker_status_diff,
+    CapsuleWorkerSemanticStatus,
+    WorkerStatus,
+    CapsuleStatus,
+    DEPLOYMENT_READY_CONDITIONS,
+    LogLine,
+)
+from .exceptions import (
+    CapsuleApiException,
+    CapsuleConcurrentUpgradeException,
+    CapsuleCrashLoopException,
+    CapsuleDeletedDuringDeploymentException,
+    CapsuleDeploymentException,
+    CapsuleReadinessException,
+    OuterboundsBackendUnhealthyException,
+    OuterboundsForbiddenException,
+)
+
+STATE_REFRESH_FREQUENCY = 1  # in seconds
+
+
+def _format_url_string(url, is_https=True):
+    if url is None:
+        return None
+
+    if url.startswith("http://") or url.startswith("https://"):
+        return url
+    if is_https:
+        return f"https://{url}"
+
+    return f"http://{url}"
+
+
+class CapsuleStateMachine:
+    """
+    - Every capsule create call will return an `identifier` and a `version` of the object.
+    - Each update call will return a new version.
+    - The status.currentlyServedVersion will be the version that is currently serving traffic.
+    - The status.updateInProgress will be True if an upgrade is in progress.
+
+    CapsuleState Transition:
+    - Every capsule create call will return an `identifier` and a `version` of the object.
+    - Happy Path:
+        - First time Create:
+            - wait for status.updateInProgress to be set to False
+            - (interleaved) Poll the worker endpoints to check their status
+                - show how many workers are coming up when running on the CLI side.
+            - If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
+            serve traffic.
+            - once the status.updateInProgress is set to False, it means that the replicas are ready
+        - Upgrade:
+            - wait for status.updateInProgress to be set to False
+            - (interleaved) Poll the worker endpoints to check their status and signal the user the number of replicas coming up
+            - If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
+            serve traffic.
+    - Unhappy Path:
+        - First time Create:
+            - wait for status.updateInProgress to be set to False,
+            - (interleaved) Poll the workers to check their status.
+                - If the worker pertaining to the current deployment instance version is crashlooping then crash the deployment process with the error messages and logs.
+        - Upgrade:
+            - wait for status.updateInProgress to be set to False,
+            - (interleaved) Poll the workers to check their status.
+                - If the worker pertaining to the current deployment instance version is crashlooping then crash the deployment process with the error messages and logs.
+
+    """
+
+    def __init__(self, capsule_id: str, current_deployment_instance_version: str):
+        self._capsule_id = capsule_id
+        self._status_trail: List[Dict[str, Any]] = []
+        self._current_deployment_instance_version = current_deployment_instance_version
+
+    def get_status_trail(self):
+        return self._status_trail
+
+    def add_status(self, status: CapsuleStatus):
+        self._status_trail.append({"timestamp": time.time(), "status": status})
+
+    @property
+    def current_status(self):
+        return self._status_trail[-1].get("status")
+
+    @property
+    def out_of_cluster_url(self):
+        access_info = self.current_status.get("accessInfo", {}) or {}
+        return _format_url_string(access_info.get("outOfClusterURL", None), True)
+
+    @property
+    def in_cluster_url(self):
+        access_info = self.current_status.get("accessInfo", {}) or {}
+        return _format_url_string(access_info.get("inClusterURL", None), True)
+
+    @property
+    def update_in_progress(self):
+        return self.current_status.get("updateInProgress", False)
+
+    @property
+    def currently_served_version(self):
+        return self.current_status.get("currentlyServedVersion", None)
+
+    @property
+    def ready_to_serve_traffic(self):
+        if self.current_status.get("readyToServeTraffic", False):
+            return any(
+                i is not None for i in [self.out_of_cluster_url, self.in_cluster_url]
+            )
+        return False
+
+    @property
+    def available_replicas(self):
+        return self.current_status.get("availableReplicas", 0)
+
+    def report_current_status(self, logger):
+        pass
+
+    def save_debug_info(self, state_dir: str):
+        debug_path = os.path.join(
+            state_dir, f"debug_capsule_sm_{self._capsule_id}.json"
+        )
+        with open(debug_path, "w") as f:
+            json.dump(self._status_trail, f, indent=4)
+
+
+class CapsuleWorkersStateMachine:
+    def __init__(
+        self,
+        capsule_id: str,
+        end_state_capsule_version: str,
+        deployment_mode: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
+        minimum_replicas: int = 1,
+    ):
+        self._capsule_id = capsule_id
+        self._end_state_capsule_version = end_state_capsule_version
+        self._deployment_mode = deployment_mode
+        self._minimum_replicas = minimum_replicas
+        self._status_trail: List[Dict[str, Union[float, List[WorkerStatus]]]] = []
+
+    def get_status_trail(self):
+        return self._status_trail
+
+    def add_status(self, worker_list_response: List[WorkerStatus]):
+        """
+        worker_list_response: List[Dict[str, Any]]
+        [
+            {
+                "workerId": "c-4pqikm-659dd9ccdc-5hcwz",
+                "phase": "Running",
+                "activity": 0,
+                "activityDataAvailable": false,
+                "version": "0xhgaewiqb"
+            },
+            {
+                "workerId": "c-4pqikm-b8559688b-xk2jh",
+                "phase": "Pending",
+                "activity": 0,
+                "activityDataAvailable": false,
+                "version": "421h48qh95"
+            }
+        ]
+        """
+        self._status_trail.append(
+            {"timestamp": time.time(), "status": worker_list_response}
+        )
+
+    def save_debug_info(self, state_dir: str):
+        debug_path = os.path.join(
+            state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
+        )
+        with open(debug_path, "w") as f:
+            json.dump(self._status_trail, f, indent=4)
+
+        status_path = os.path.join(
+            state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
+        )
+        with open(status_path, "w") as f:
+            json.dump(self.current_version_deployment_status(), f, indent=4)
+
+    def report_current_status(self, logger):
+        if len(self._status_trail) == 0:
+            return
+        older_status = None
+        if len(self._status_trail) >= 2:
+            older_status = _capsule_worker_semantic_status(
+                self._status_trail[-2].get("status"),
+                self._end_state_capsule_version,
+                self._minimum_replicas,
+            )
+        current_status = self.current_version_deployment_status()
+        changes = _capsule_worker_status_diff(current_status, older_status)
+        if len(changes) > 0:
+            logger(*changes)
+
+    @property
+    def current_status(self) -> List[WorkerStatus]:
+        return self._status_trail[-1].get("status")  # type: ignore
+
+    def current_version_deployment_status(self) -> CapsuleWorkerSemanticStatus:
+        return _capsule_worker_semantic_status(
+            self.current_status, self._end_state_capsule_version, self._minimum_replicas
+        )
+
+    @property
+    def is_crashlooping(self) -> bool:
+        status = self.current_version_deployment_status()
+        return status["status"]["at_least_one_crashlooping"]
+
+
+class CapsuleInput:
+    @classmethod
+    def construct_exec_command(cls, commands: List[str]):
+        commands = ["set -eEuo pipefail"] + commands
+        command_string = "\n".join(commands)
+        # First construct a base64 encoded string of the quoted command.
+        # One of the reasons we don't directly pass the command string to the backend with a `\n` join
+        # is because the backend controller doesn't play nice when the command can be a multi-line string.
+        # So we encode it to a base64 string and then decode it back to a command string at runtime to provide to
+        # `bash -c`. The ideal thing to have done is to run "bash -c {shlex.quote(command_string)}" and call it a day
+        # but the backend controller yields the following error:
+        # `error parsing template: error converting YAML to JSON: yaml: line 111: mapping values are not allowed in this context`
+        # So we go to great lengths to ensure the command is provided in base64 to avoid any issues with the backend controller.
+        import base64
+
+        encoded_command = base64.b64encode(command_string.encode()).decode()
+        decode_cmd = f"echo {encoded_command} | base64 -d > ./_ob_app_run.sh"
+        return (
+            f"bash -c '{decode_cmd} && cat ./_ob_app_run.sh && bash ./_ob_app_run.sh'"
+        )
+
+    @classmethod
+    def _marshal_environment_variables(cls, app_config: AppConfig):
+        envs = app_config.get_state("environment", {}).copy()
+        _return = []
+        for k, v in envs.items():
+            _v = v
+            if isinstance(v, dict):
+                _v = json.dumps(v)
+            elif isinstance(v, list):
+                _v = json.dumps(v)
+            else:
+                _v = str(v)
+            _return.append(
+                {
+                    "name": k,
+                    "value": _v,
+                }
+            )
+        return _return
+
+    @classmethod
+    def from_app_config(cls, app_config: AppConfig):
+        ## Replica settings
+        replicas = app_config.get_state("replicas", {})
+        fixed, _min, _max = (
+            replicas.get("fixed"),
+            replicas.get("min"),
+            replicas.get("max"),
+        )
+        rpm = replicas.get("scaling_policy", {}).get("rpm", None)
+        autoscaling_config = {}
+        if rpm:
+            autoscaling_config = {
+                "requestRateBasedAutoscalingConfig": {"targetRequestsPerMinute": rpm}
+            }
+        if fixed is not None:
+            _min, _max = fixed, fixed
+        gpu_resource = app_config.get_state("resources").get("gpu")
+        resources = {}
+        shared_memory = app_config.get_state("resources").get("shared_memory")
+        if gpu_resource:
+            resources["gpu"] = gpu_resource
+        if shared_memory:
+            resources["sharedMemory"] = shared_memory
+
+        _scheduling_config = {}
+        if app_config.get_state("compute_pools", None):
+            _scheduling_config["schedulingConfig"] = {
+                "computePools": [
+                    {"name": x} for x in app_config.get_state("compute_pools")
+                ]
+            }
+        _description = app_config.get_state("description")
+        _app_type = app_config.get_state("app_type")
+        _final_info = {}
+        if _description:
+            _final_info["description"] = _description
+        if _app_type:
+            _final_info["endpointType"] = _app_type
+        return {
+            "perimeter": app_config.get_state("perimeter"),
+            **_final_info,
+            "codePackagePath": app_config.get_state("code_package_url"),
+            "image": app_config.get_state("image"),
+            "resourceIntegrations": [
+                {"name": x} for x in app_config.get_state("secrets", [])
+            ],
+            "resourceConfig": {
+                "cpu": str(app_config.get_state("resources").get("cpu")),
+                "memory": str(app_config.get_state("resources").get("memory")),
+                "ephemeralStorage": str(app_config.get_state("resources").get("disk")),
+                **resources,
+            },
+            "autoscalingConfig": {
+                "minReplicas": _min,
+                "maxReplicas": _max,
+                **autoscaling_config,
+            },
+            **_scheduling_config,
+            "containerStartupConfig": {
+                "entrypoint": cls.construct_exec_command(
+                    app_config.get_state("commands")
+                )
+            },
+            "environmentVariables": cls._marshal_environment_variables(app_config),
+            # "assets": [{"name": "startup-script.sh"}],
+            "authConfig": {
+                "authType": app_config.get_state("auth").get("type"),
+                "publicToDeployment": app_config.get_state("auth").get("public"),
+            },
+            "tags": [
+                dict(key=k, value=v)
+                for tag in app_config.get_state("tags", [])
+                for k, v in tag.items()
+            ],
+            "port": app_config.get_state("port"),
+            "displayName": app_config.get_state("name"),
+            "forceUpdate": app_config.get_state("force_upgrade", False),
+        }
+
+
+class CapsuleApi:
+    def __init__(self, base_url: str, perimeter: str, logger_fn=None, retry_500s=False):
+        self._base_url = self._create_base_url(base_url, perimeter)
+        from metaflow.metaflow_config import SERVICE_HEADERS
+
+        self._retry_500s = retry_500s
+        self._logger_fn = logger_fn
+        self._request_headers = {
+            **{"Content-Type": "application/json", "Connection": "keep-alive"},
+            **(SERVICE_HEADERS or {}),
+        }
+
+    @staticmethod
+    def _create_base_url(base_url: str, perimeter: str):
+        return os.path.join(
+            base_url,
+            "v1",
+            "perimeters",
+            perimeter,
+            "capsules",
+        )
+
+    def _wrapped_api_caller(self, method_func, *args, **kwargs):
+        try:
+            response = safe_requests_wrapper(
+                method_func,
+                *args,
+                headers=self._request_headers,
+                logger_fn=self._logger_fn,
+                **kwargs,
+            )
+            # The CapsuleApi wraps every API call happening to the capsule
+            # API. We do this so that we can raise exceptions in a way that makes
+            # things clearer to the end-user and the operator. Since the safe_requests_wrapper
+            # can already retry 5xx errors too, we should ensure that any time we hit max
+            # retries, or if we hit 5xx without retries, we raise a "special" exception
+            # and not the CapsuleApiException, to notify the operator that the
+            # backend is not working right now and their application crashes. We can lift the
+            # exception to the top level to make it importable so operators can deal with that
+            # condition however they like.
+        except MaximumRetriesExceeded as e:
+            if e.status_code >= 500:
+                raise OuterboundsBackendUnhealthyException(
+                    e.url,
+                    e.method,
+                    e.status_code,
+                    e.text,
+                )
+            raise CapsuleApiException(
+                e.url,
+                e.method,
+                e.status_code,
+                e.text,
+                message=f"Maximum retries exceeded for {e.url} [{e.method}]",
+            )
+        except requests.exceptions.ConnectionError as e:
+            # Network connectivity issues after retries exhausted
+            raise OuterboundsBackendUnhealthyException(
+                url=args[0] if args else "unknown",
+                method=method_func.__name__,
+                message=(
+                    f"Unable to reach Outerbounds backend at {args[0] if args else 'unknown'}. "
+                    "This could be due to network connectivity issues, DNS resolution failures, "
+                    "or the service being temporarily unavailable. "
+                    "Please check your network connection and retry. "
+                    "If the issue persists, contact Outerbounds support."
+                ),
+            ) from e
+
+        if response.status_code >= 500:
+            raise OuterboundsBackendUnhealthyException(
+                args[0],
+                method_func.__name__,
+                response.status_code,
+                response.text,
+                message=(
+                    f"Outerbounds backend returned an error (HTTP {response.status_code}). "
+                    "This is a server-side issue, not a problem with your configuration. "
+                    "Please retry your request. If the issue persists, contact Outerbounds support."
+                ),
+            )
+
+        elif response.status_code == 403:
+            raise OuterboundsForbiddenException(
+                args[0],
+                method_func.__name__,
+                response.text,
+            )
+
+        elif response.status_code >= 400:
+            raise CapsuleApiException(
+                args[0],
+                method_func.__name__,
+                response.status_code,
+                response.text,
+            )
+        return response
+
+    def _retry_parameters(
+        self,
+        status_codes,
+        retries,
+    ):
+        """
+        All functions calling the wrapped_api_caller use this function to
+        set the number of retries for the API calls. It sets status codes
+        that are allowed up to N retries (total, including connection retries).
+        If no status codes are passed we should still always pass connection
+        retries > 0 since DNS can infrequently be flaky and we don't want to
+        trip up there.
+        """
+        kwargs = {}
+        if self._retry_500s:
+            kwargs = dict(
+                retryable_status_codes=status_codes
+                + [500, 502, 503, 504],  # todo : verify me
+                conn_error_retries=max(
+                    3, retries
+                ),  # connection retries + any other retries.
+            )
+        else:
+            kwargs = dict(
+                retryable_status_codes=status_codes,
+                conn_error_retries=retries,  # connection retries + any other retries.
+            )
+        return kwargs
+
+    def create(self, capsule_input: dict):
+        _data = json.dumps(capsule_input)
+        response = self._wrapped_api_caller(
+            requests.post, self._base_url, data=_data, **self._retry_parameters([], 3)
+        )
+        try:
+            return response.json()
+        except json.JSONDecodeError as e:
+            raise CapsuleApiException(
+                self._base_url,
+                "post",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+
+    def get(self, capsule_id: str) -> Dict[str, Any]:
+        _url = os.path.join(self._base_url, capsule_id)
+        response = self._wrapped_api_caller(
+            requests.get, _url, **self._retry_parameters([409, 404], 3)
+        )
+        try:
+            return response.json()
+        except json.JSONDecodeError as e:
+            raise CapsuleApiException(
+                _url,
+                "get",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+
+    # TODO: refactor me since name *currently(9/8/25)* is unique across capsules.
+    def get_by_name(self, name: str, most_recent_only: bool = True):
+        _url = os.path.join(self._base_url, f"?displayName={name}")
+        response = self._wrapped_api_caller(
+            requests.get, _url, **self._retry_parameters([409], 3)
+        )
+        try:
+            if most_recent_only:
+                result = response.json()
+                candidates = result["capsules"]
+                if not candidates:
+                    return None
+                return sorted(
+                    candidates, key=lambda x: x["metadata"]["createdAt"], reverse=True
+                )[0]
+            else:
+                return response.json()
+        except json.JSONDecodeError as e:
+            raise CapsuleApiException(
+                _url,
+                "get",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+
+    def list(self):
+        response = self._wrapped_api_caller(
+            requests.get, self._base_url, **self._retry_parameters([409], 3)
+        )
+        try:
+            response_json = response.json()
+        except json.JSONDecodeError as e:
+            raise CapsuleApiException(
+                self._base_url,
+                "get",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+        if "capsules" not in response_json:
+            raise CapsuleApiException(
+                self._base_url,
+                "get",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+        return response_json.get("capsules", []) or []
+
+    def delete(self, capsule_id: str):
+        _url = os.path.join(self._base_url, capsule_id)
+        response = self._wrapped_api_caller(
+            requests.delete, _url, **self._retry_parameters([409], 3)
+        )
+        if response.status_code >= 400:
+            raise CapsuleApiException(
+                _url,
+                "delete",
+                response.status_code,
+                response.text,
+            )
+
+        if response.status_code == 200:
+            return True
+        return False
+
+    def get_workers(self, capsule_id: str) -> List[Dict[str, Any]]:
+        _url = os.path.join(self._base_url, capsule_id, "workers")
+        response = self._wrapped_api_caller(
+            requests.get,
+            _url,
+            # Adding 404s because sometimes we might even end up getting 404s if
+            # the backend cache is not updated yet. So on consistent 404s we should
+            # just crash out.
+            **self._retry_parameters([409, 404], 3),
+        )
+        try:
+            return response.json().get("workers", []) or []
+        except json.JSONDecodeError as e:
+            raise CapsuleApiException(
+                _url,
+                "get",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+
+    def logs(
+        self, capsule_id: str, worker_id: str, previous: bool = False
+    ) -> List[LogLine]:
+        _url = os.path.join(self._base_url, capsule_id, "workers", worker_id, "logs")
+        options = None
+        if previous:
+            options = {"previous": True}
+        response = self._wrapped_api_caller(
+            requests.get, _url, params=options, **self._retry_parameters([409], 3)
+        )
+        try:
+            return response.json().get("logs", []) or []
+        except json.JSONDecodeError as e:
+            raise CapsuleApiException(
+                _url,
+                "get",
+                response.status_code,
+                response.text,
+                message="Capsule JSON decode failed",
+            )
+
+    def patch(self, capsule_id: str, patch_input: dict):
+        capsule_response = self.get(capsule_id)
+        if "spec" not in capsule_response or len(capsule_response.get("spec", {})) == 0:
+            raise CapsuleApiException(
+                self._base_url,
+                "patch",
+                403,
+                "Capsule response of incorrect format",
+            )
+
+        spec = capsule_response.get("spec")
+        spec.update(patch_input)
+        return self.create(spec)
+
+
+def list_and_filter_capsules(
+    capsule_api: CapsuleApi, project, branch, name, tags, auth_type, capsule_id
+):
+    capsules = capsule_api.list()
+
+    def _tags_match(tags, key, value):
+        for t in tags:
+            if t["key"] == key and t["value"] == value:
+                return True
+        return False
+
+    def _all_tags_match(tags, tags_to_match):
+        return all([_tags_match(tags, t["key"], t["value"]) for t in tags_to_match])
+
+    def _filter_capsules(capsules, project, branch, name, tags, auth_type, capsule_id):
+        _filtered_capsules = []
+        for capsule in capsules:
+            set_tags = capsule.get("spec", {}).get("tags", [])
+            display_name = capsule.get("spec", {}).get("displayName", None)
+            set_id = capsule.get("id", None)
+            set_auth_type = (
+                capsule.get("spec", {}).get("authConfig", {}).get("authType", None)
+            )
+
+            if auth_type and set_auth_type != auth_type:
+                continue
+            if project and not _tags_match(set_tags, "project", project):
+                continue
+            if branch and not _tags_match(set_tags, "branch", branch):
+                continue
+            if name and display_name != name:
+                continue
+            if tags and not _all_tags_match(set_tags, tags):
+                continue
+            if capsule_id and set_id != capsule_id:
+                continue
+
+            _filtered_capsules.append(capsule)
+        return _filtered_capsules
+
+    return _filter_capsules(
+        capsules, project, branch, name, tags, auth_type, capsule_id
+    )
+
+
+from collections import namedtuple
+
+CapsuleInfo = namedtuple("CapsuleInfo", ["info", "workers"])
+
+
+class CapsuleDeployer:
+
+    status: CapsuleStateMachine
+
+    identifier = None
+
+    # TODO: The current default timeout of 5 minutes is very large. Ideally the deploy should finish in under 1 minute.
+    def __init__(
+        self,
+        app_config: AppConfig,
+        base_url: str,
+        create_timeout: int = 60 * 5,
+        debug_dir: Optional[str] = None,
+        success_terminal_state_condition: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
+        readiness_wait_time: int = 20,
+        logger_fn=None,
+    ):
+        self._app_config = app_config
+        self._capsule_api = CapsuleApi(
+            base_url,
+            app_config.get_state("perimeter"),
+            logger_fn=logger_fn or partial(print, file=sys.stderr),
+            retry_500s=True
+            # retry on 5xx because the capsule deployer might even be used
+            # programmatically, so any intermittent breakage shouldn't break the overall
+            # control flow unless the breakage is severe (maybe over 20s of complete outage)
+        )
+        self._create_timeout = create_timeout
+        self._logger_fn = logger_fn
+        self._debug_dir = debug_dir
+        self._capsule_deploy_response = None
+        self._success_terminal_state_condition = success_terminal_state_condition
+        self._readiness_wait_time = readiness_wait_time
+
+    @property
+    def url(self):
+        return _format_url_string(
+            (self._capsule_deploy_response or {}).get("outOfClusterUrl", None), True
+        )
+
+    @property
+    def capsule_api(self):
+        return self._capsule_api
+
+    @property
+    def capsule_type(self):
+        auth_type = self._app_config.get_state("auth", {}).get("type", AuthType.default)
+        if auth_type == AuthType.BROWSER:
+            return "App"
+        elif auth_type == AuthType.API or auth_type == AuthType.BROWSER_AND_API:
+            return "Endpoint"
+        else:
+            raise TODOException(f"Unknown auth type: {auth_type}")
+
+    @property
+    def name(self):
+        return self._app_config.get_state("name")
+
+    def create_input(self):
+        return experimental.capsule_input_overrides(
+            self._app_config, CapsuleInput.from_app_config(self._app_config)
+        )
+
+    @property
+    def current_deployment_instance_version(self):
+        """
+        The backend `create` call returns a version of the object that will be deployed by this instance of the deployer.
+        """
+        if self._capsule_deploy_response is None:
+            return None
+        return self._capsule_deploy_response.get("version", None)
+
+    def create(self):
+        capsule_response = self._capsule_api.create(self.create_input())
+        self.identifier = capsule_response.get("id")
+        self._capsule_deploy_response = capsule_response
+        return self.identifier
+
+    def get(self):
+        return self._capsule_api.get(self.identifier)
+
+    def get_workers(self):
+        return self._capsule_api.get_workers(self.identifier)
+
+    def _backend_version_mismatch_check(
+        self, capsule_response: dict, current_deployment_instance_version: str
+    ):
+        """
+        - `capsule_response.version` contains the version of the object present in the database
+        - `current_deployment_instance_version` contains the version of the object that was deployed by this instance of the deployer.
+        If the versions of the objects mismatch, it means that the current deployment process is not giving the user the
+        output that they desire.
+        """
+        if capsule_response.get("version", None) != current_deployment_instance_version:
+            metadata = capsule_response.get("metadata", {}) or {}
+            raise CapsuleConcurrentUpgradeException(
+                self.identifier,  # type: ignore
+                expected_version=current_deployment_instance_version,
+                actual_version=capsule_response.get("version", None),
+                modified_by=metadata.get("lastModifiedBy"),
+                modified_at=metadata.get("lastModifiedAt"),
+            )
+
+    def _update_capsule_and_worker_sm(
+        self,
+        capsule_sm: "CapsuleStateMachine",
+        workers_sm: "CapsuleWorkersStateMachine",
+        logger: Callable[[str], None],
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        try:
+            capsule_response = self.get()
+        except CapsuleApiException as e:
+            # At this point when the code is executing
+            # the CapsuleDeployer would already have created
+            # the capsule since this function is called within
+            # wait_for_terminal_state. Because of that, if there
+            # is now a 404 then it means someone deleted the
+            # deployment WHILE this deployment instance is running.
+            # We should notify the user that something funky has
+            # happened over here. We need to do this since Apps can
+            # now be programmatically created / deleted; we need to
+            # ensure that if someone has done something concurrent-unsafe
+            # (foo deleting bar's deployment while bar is deploying)
+            # then for that circumstance we should raise an exception here
+            # that something funky has happened. Otherwise if the
+            # CapsuleApiException leaks out then it should be fine.
+            if e.status_code == 404:
+                raise CapsuleDeletedDuringDeploymentException(self.identifier) from e
+            raise
+        capsule_sm.add_status(capsule_response.get("status", {}))  # type: ignore
+
+        # We need to check whether someone has upgraded the capsule under the hood,
+        # making the current deployment instance invalid.
+        self._backend_version_mismatch_check(
+            capsule_response, self.current_deployment_instance_version  # type: ignore
+        )
+        workers_response = self.get_workers()
+        capsule_sm.report_current_status(logger)
+        workers_sm.add_status(workers_response)
+        workers_sm.report_current_status(logger)
+        return capsule_response, workers_response
+
+    def _publish_capsule_debug_info(
+        self,
+        capsule_sm: "CapsuleStateMachine",
+        workers_sm: "CapsuleWorkersStateMachine",
+        capsule_response: Dict[str, Any],
+    ):
+        if CAPSULE_DEBUG and self._debug_dir:
+            capsule_sm.save_debug_info(self._debug_dir)
+            workers_sm.save_debug_info(self._debug_dir)
+            debug_path = os.path.join(
+                self._debug_dir, f"debug_capsule_{self.identifier}.json"
+            )
+            with open(debug_path, "w") as f:
+                f.write(json.dumps(capsule_response, indent=4))
+
+    def _monitor_worker_readiness(
+        self,
+        workers_sm: "CapsuleWorkersStateMachine",
+        capsule_sm: "CapsuleStateMachine",
+    ):
+        """returns True if the worker is crashlooping, False otherwise"""
+        logger = self._logger_fn or partial(print, file=sys.stderr)
+        for i in range(self._readiness_wait_time):
+            time.sleep(STATE_REFRESH_FREQUENCY)
+            self._update_capsule_and_worker_sm(
+                capsule_sm, workers_sm, logger
+            )  # [2 API calls]
+            if workers_sm.is_crashlooping:
+                return True
+        return False
+
+    def _extract_logs_from_crashlooping_worker(
+        self, workers_sm: "CapsuleWorkersStateMachine"
+    ):
+        def _extract_worker_id_of_crashlooping_worker(
+            workers_status: List[WorkerStatus],
+        ):
+            for worker in workers_status:
+                if worker["phase"] == "CrashLoopBackOff" or worker["phase"] == "Failed":
+                    return worker["workerId"]
+            return None
+
+        worker_id = _extract_worker_id_of_crashlooping_worker(workers_sm.current_status)
+        if worker_id is None:
+            return None, None
+        logs = self.capsule_api.logs(self.identifier, worker_id, previous=True)
+        return logs, worker_id
+
+    def _get_min_replicas(self):
+        replicas = self._app_config.get_state("replicas", {})
+        fixed, _min, _ = replicas.get("fixed"), replicas.get("min"), replicas.get("max")
+        if fixed is not None:
+            return fixed
+        return _min
+
+    def wait_for_terminal_state(
+        self,
+    ):
+        """ """
+        logger = self._logger_fn or partial(print, file=sys.stderr)
+        state_machine = CapsuleStateMachine(
+            self.identifier, self.current_deployment_instance_version
+        )
+        # min_replicas will always be present
+        min_replicas = self._get_min_replicas()
+        workers_state_machine = CapsuleWorkersStateMachine(
+            self.identifier,
+            self.current_deployment_instance_version,
+            deployment_mode=self._success_terminal_state_condition,
+            minimum_replicas=min_replicas,
+        )
+        self.status = state_machine
+
+        # This loop will check all the conditions that help verify the terminal state.
+        # How it works is by extracting the statuses of the capsule and workers and
+        # then adding them as a part of a state machine that helps track transitions and
+        # helps derive terminal states.
+        # We will first keep checking for terminal conditions or outright failure conditions.
+        # If we reach a terminal condition as described in `DEPLOYMENT_READY_CONDITIONS`, then
+        # we will further check for readiness conditions.
+        for i in range(self._create_timeout):
+            time.sleep(STATE_REFRESH_FREQUENCY)
+            capsule_response, _ = self._update_capsule_and_worker_sm(  # [2 API calls]
+                state_machine, workers_state_machine, logger
+            )
+            # Deployment readiness checks will determine what is the terminal state
+            # of the worker state machine. If we detect a terminal state in the workers,
+            # then even if the capsule upgrade is still in progress we will end up crashing
+            # the deployment.
+            (
+                capsule_ready,
+                further_check_worker_readiness,
+            ) = DEPLOYMENT_READY_CONDITIONS.check_readiness_condition(
+                state_machine.current_status,
+                workers_state_machine.current_version_deployment_status(),
+                self._success_terminal_state_condition,
+            )
+
+            failure_condition_satisfied = (
+                DEPLOYMENT_READY_CONDITIONS.check_failure_condition(
+                    state_machine.current_status,
+                    workers_state_machine.current_version_deployment_status(),
+                )
+            )
+            if capsule_ready or failure_condition_satisfied:
+                logger(
+                    "💊 %s deployment status: %s "
+                    % (
+                        self.capsule_type.title(),
+                        (
+                            "in progress"
+                            if state_machine.update_in_progress
+                            else "completed"
+                        ),
+                    )
+                )
+                _further_readiness_check_failed = False
+                if further_check_worker_readiness:
+                    # HACK: monitor the workers for N seconds to make sure they are healthy.
+                    # This is a hack. Ideally we should implement a healthcheck as a first-class citizen
+                    # but it will take some time to do that, so in the meanwhile a timeout set on the CLI
+                    # side will be really helpful.
+                    logger(
+                        "💊 Running last minute readiness check for %s..."
+                        % self.identifier
+                    )
+                    _further_readiness_check_failed = self._monitor_worker_readiness(
+                        workers_state_machine,
+                        state_machine,
+                    )
+
+                if CAPSULE_DEBUG:
+                    logger(
+                        f"[debug] 💊 {self.capsule_type} {self.identifier}: further_check_worker_readiness {_further_readiness_check_failed} | failure_condition_satisfied {failure_condition_satisfied}"
+                    )
+
+                # We should still check for failure state and crash if we detect something in the readiness check
+                if failure_condition_satisfied or _further_readiness_check_failed:
+                    # hit the logs endpoint for the worker and get the logs,
+                    # print those logs out on the terminal, and
+                    # raise an exception that should be caught gracefully by the CLI
+                    logs, worker_id = self._extract_logs_from_crashlooping_worker(
+                        workers_state_machine
+                    )
+                    if logs is not None:
+                        # todo: It would be really odd if the logs are not present and we discover something is crashlooping.
+                        # Handle that condition later
+                        logger(
+                            *(
+                                [
+                                    f"💥 Worker ID ({worker_id}) is crashlooping. Please check the following logs for more information: "
+                                ]
+                                + ["\t" + l["message"] for l in logs]
+                            )
+                        )
+                    raise CapsuleCrashLoopException(
+                        self.identifier,
+                        worker_id=worker_id,
+                        logs=logs,
+                    )
+
+                if state_machine.ready_to_serve_traffic:
+                    logger(
+                        "💊 %s %s is ready to serve traffic on the URL: %s"
+                        % (
+                            self.capsule_type,
+                            self.identifier,
+                            state_machine.out_of_cluster_url,
+                        ),
+                    )
+
+                break
+
+            self._publish_capsule_debug_info(
+                state_machine, workers_state_machine, capsule_response
+            )
+
+            if CAPSULE_DEBUG and i % 3 == 0:  # Every 3 seconds report the status
+                logger(
+                    f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status} | capsule_ready : {capsule_ready} | further_check_worker_readiness {further_check_worker_readiness}"
+                )
+
+        self._publish_capsule_debug_info(
+            state_machine, workers_state_machine, capsule_response
+        )
+
+        # We will only check ready_to_serve_traffic under the following conditions:
+        # if the readiness condition is not Async and min_replicas in this deployment
+        # instance is > 0
+        _is_async_readiness = (
+            self._success_terminal_state_condition == DEPLOYMENT_READY_CONDITIONS.ASYNC
+        )
+        if (
+            min_replicas > 0
+            and not _is_async_readiness
+            and not self.status.ready_to_serve_traffic
+        ):
+            raise CapsuleReadinessException(
+                self.identifier,
+            )
+        auth_type = self._app_config.get_state("auth", {}).get("type", AuthType.default)
+        return dict(
+            id=self.identifier,
+            auth_type=auth_type,
+            public_url=self.url,
+            available_replicas=self.status.available_replicas,
+            name=self.name,
+            deployed_version=self.current_deployment_instance_version,
+            deployed_at=datetime.now().isoformat(),
+        )