outerbounds 0.3.183rc1__py3-none-any.whl → 0.3.185__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- outerbounds/__init__.py +1 -3
- outerbounds/command_groups/apps_cli.py +6 -2
- {outerbounds-0.3.183rc1.dist-info → outerbounds-0.3.185.dist-info}/METADATA +3 -3
- {outerbounds-0.3.183rc1.dist-info → outerbounds-0.3.185.dist-info}/RECORD +6 -29
- outerbounds-0.3.185.dist-info/entry_points.txt +3 -0
- outerbounds/_vendor/spinner/__init__.py +0 -4
- outerbounds/_vendor/spinner/spinners.py +0 -478
- outerbounds/_vendor/spinner.LICENSE +0 -21
- outerbounds/apps/__init__.py +0 -0
- outerbounds/apps/_state_machine.py +0 -472
- outerbounds/apps/app_cli.py +0 -1514
- outerbounds/apps/app_config.py +0 -296
- outerbounds/apps/artifacts.py +0 -0
- outerbounds/apps/capsule.py +0 -839
- outerbounds/apps/cli_to_config.py +0 -99
- outerbounds/apps/click_importer.py +0 -24
- outerbounds/apps/code_package/__init__.py +0 -3
- outerbounds/apps/code_package/code_packager.py +0 -610
- outerbounds/apps/code_package/examples.py +0 -125
- outerbounds/apps/config_schema.yaml +0 -269
- outerbounds/apps/config_schema_autogen.json +0 -336
- outerbounds/apps/dependencies.py +0 -115
- outerbounds/apps/deployer.py +0 -0
- outerbounds/apps/experimental/__init__.py +0 -110
- outerbounds/apps/perimeters.py +0 -45
- outerbounds/apps/secrets.py +0 -164
- outerbounds/apps/utils.py +0 -234
- outerbounds/apps/validations.py +0 -22
- outerbounds-0.3.183rc1.dist-info/entry_points.txt +0 -3
- {outerbounds-0.3.183rc1.dist-info → outerbounds-0.3.185.dist-info}/WHEEL +0 -0
outerbounds/apps/capsule.py
DELETED
@@ -1,839 +0,0 @@
|
|
1
|
-
from datetime import datetime
|
2
|
-
import json
|
3
|
-
import os
|
4
|
-
import pathlib
|
5
|
-
import requests
|
6
|
-
import sys
|
7
|
-
import time
|
8
|
-
from functools import partial
|
9
|
-
import shlex
|
10
|
-
from typing import Optional, List, Dict, Any, Tuple, Union
|
11
|
-
from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
|
12
|
-
from .app_config import AppConfig, CAPSULE_DEBUG, AuthType
|
13
|
-
from . import experimental
|
14
|
-
from ._state_machine import (
|
15
|
-
_capsule_worker_semantic_status,
|
16
|
-
_capsule_worker_status_diff,
|
17
|
-
CapsuleWorkerSemanticStatus,
|
18
|
-
WorkerStatus,
|
19
|
-
CapsuleStatus,
|
20
|
-
DEPLOYMENT_READY_CONDITIONS,
|
21
|
-
)
|
22
|
-
|
23
|
-
|
24
|
-
class CapsuleStateMachine:
    """
    Records the trail of `status` payloads returned by the capsule API for a
    single capsule and exposes convenience accessors over the latest one.

    - Every capsule create call will return a `identifier` and a `version` of the object.
    - Each update call will return a new version.
    - The status.currentlyServedVersion will be the version that is currently serving traffic.
    - The status.updateInProgress will be True if an upgrade is in progress.

    CapsuleState Transition:
    - Every capsule create call will return a `identifier` and a `version` of the object.
    - Happy Path:
        - First time Create:
            - wait for status.updateInProgress to be set to False
            - (interleaved) Poll the worker endpoints to check their status
                - showcase how many workers are coming up if things are on the cli side.
            - If the user has set some flag like `--dont-wait-to-fully-finish` then we check
              the `status.currentlyServedVersion` to see if even one replica is ready to
              serve traffic.
            - once the status.updateInProgress is set to False, it means that the replicas are ready
        - Upgrade:
            - wait for status.updateInProgress to be set to False
            - (interleaved) Poll the worker endpoints to check their status and signal the
              user the number replicas coming up
            - If the user has set some flag like `--dont-wait-to-fully-finish` then we check
              the `status.currentlyServedVersion` to see if even one replica is ready to
              serve traffic.
    - Unhappy Path:
        - First time Create:
            - wait for status.updateInProgress to be set to False,
            - (interleaved) Poll the workers to check their status.
                - If the worker pertaining the current deployment instance version is
                  crashlooping then crash the deployment process with the error messages and logs.
        - Upgrade:
            - wait for status.updateInProgress to be set to False,
            - (interleaved) Poll the workers to check their status.
                - If the worker pertaining the current deployment instance version is
                  crashlooping then crash the deployment process with the error messages and logs.
    """

    def __init__(self, capsule_id: str, current_deployment_instance_version: str):
        self._capsule_id = capsule_id
        # Ordered list of {"timestamp": float, "status": dict} snapshots.
        self._status_trail: List[Dict[str, Any]] = []
        self._current_deployment_instance_version = current_deployment_instance_version

    def get_status_trail(self):
        """Return every recorded status snapshot (oldest first)."""
        return self._status_trail

    def add_status(self, status: "CapsuleStatus"):
        """Append a status payload to the trail, stamped with the current time.

        Raises:
            TypeError: if `status` is not a dict.
        """
        # Fix: validate with isinstance and a real exception instead of the
        # original `assert type(status) == dict, "TODO: ..."` — asserts are
        # stripped under `python -O` and `type(x) == T` rejects subclasses.
        if not isinstance(status, dict):
            raise TypeError(f"status must be a dict, got {type(status).__name__}")
        self._status_trail.append({"timestamp": time.time(), "status": status})

    @property
    def current_status(self):
        """The most recently recorded status payload."""
        return self._status_trail[-1].get("status")

    def _access_url(self, key: str) -> Optional[str]:
        # Shared lookup used by the two URL properties below; `accessInfo`
        # may be absent or explicitly null in the payload, hence `or {}`.
        access_info = self.current_status.get("accessInfo", {}) or {}
        url = access_info.get(key, None)
        if url is not None:
            return f"https://{url}"
        return None

    @property
    def out_of_cluster_url(self):
        """Public https URL of the capsule, or None if not exposed yet."""
        return self._access_url("outOfClusterURL")

    @property
    def in_cluster_url(self):
        """In-cluster https URL of the capsule, or None if not exposed yet."""
        return self._access_url("inClusterURL")

    @property
    def update_in_progress(self):
        return self.current_status.get("updateInProgress", False)

    @property
    def currently_served_version(self):
        return self.current_status.get("currentlyServedVersion", None)

    @property
    def ready_to_serve_traffic(self):
        # Ready only when the backend reports readiness AND at least one of
        # the two access URLs has been published.
        if self.current_status.get("readyToServeTraffic", False):
            return any(
                i is not None for i in [self.out_of_cluster_url, self.in_cluster_url]
            )
        return False

    @property
    def available_replicas(self):
        return self.current_status.get("availableReplicas", 0)

    def report_current_status(self, logger):
        # Intentionally a no-op; per-worker reporting is handled by
        # CapsuleWorkersStateMachine.report_current_status.
        pass

    def save_debug_info(self, state_dir: str):
        """Dump the full status trail as JSON into `state_dir` for debugging."""
        debug_path = os.path.join(state_dir, f"debug_capsule_{self._capsule_id}.json")
        with open(debug_path, "w") as f:
            json.dump(self._status_trail, f, indent=4)
|
117
|
-
|
118
|
-
|
119
|
-
class CapsuleWorkersStateMachine:
    """Accumulates successive worker-list responses for one capsule and derives
    a semantic deployment status for the targeted capsule version."""

    def __init__(
        self,
        capsule_id: str,
        end_state_capsule_version: str,
        deployment_mode: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
        minimum_replicas: int = 1,
    ):
        self._capsule_id = capsule_id
        self._end_state_capsule_version = end_state_capsule_version
        self._deployment_mode = deployment_mode
        self._minimum_replicas = minimum_replicas
        self._status_trail: List[Dict[str, Union[float, List[WorkerStatus]]]] = []

    def get_status_trail(self):
        """Return every recorded worker snapshot, oldest first."""
        return self._status_trail

    def add_status(self, worker_list_response: List[WorkerStatus]):
        """Record one raw snapshot of the workers endpoint.

        `worker_list_response` is the list returned by the backend, e.g.:
        [
            {
                "workerId": "c-4pqikm-659dd9ccdc-5hcwz",
                "phase": "Running",
                "activity": 0,
                "activityDataAvailable": false,
                "version": "0xhgaewiqb"
            },
            ...
        ]
        """
        snapshot = {"timestamp": time.time(), "status": worker_list_response}
        self._status_trail.append(snapshot)

    def save_debug_info(self, state_dir: str):
        """Write both the raw trail and the derived status to JSON files."""
        trail_file = os.path.join(
            state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
        )
        with open(trail_file, "w") as handle:
            json.dump(self._status_trail, handle, indent=4)

        status_file = os.path.join(
            state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
        )
        with open(status_file, "w") as handle:
            json.dump(self.current_version_deployment_status(), handle, indent=4)

    def report_current_status(self, logger):
        """Log only what changed since the previous recorded snapshot."""
        if not self._status_trail:
            return
        previous = None
        if len(self._status_trail) >= 2:
            previous = _capsule_worker_semantic_status(
                self._status_trail[-2].get("status"),
                self._end_state_capsule_version,
                self._minimum_replicas,
            )
        delta = _capsule_worker_status_diff(
            self.current_version_deployment_status(), previous
        )
        if delta:
            logger(*delta)

    @property
    def current_status(self) -> List[WorkerStatus]:
        return self._status_trail[-1].get("status")  # type: ignore

    def current_version_deployment_status(self) -> CapsuleWorkerSemanticStatus:
        """Derive the semantic status of the latest snapshot for the target version."""
        return _capsule_worker_semantic_status(
            self.current_status, self._end_state_capsule_version, self._minimum_replicas
        )

    @property
    def is_crashlooping(self) -> bool:
        """Whether the derived status reports `at_least_one_crashlooping`."""
        semantic = self.current_version_deployment_status()
        return semantic["status"]["at_least_one_crashlooping"]
|
201
|
-
|
202
|
-
|
203
|
-
class CapsuleInput:
    """Builds the JSON payload for the capsule-create API from an AppConfig."""

    @classmethod
    def construct_exec_command(cls, commands: list[str]):
        """Pack `commands` into a single `bash -c` invocation string.

        Returns a one-line shell command that decodes a base64 payload into
        `./_ob_app_run.sh`, prints it, and executes it.
        """
        commands = ["set -eEuo pipefail"] + commands
        command_string = "\n".join(commands)
        # First constuct a base64 encoded string of the quoted command
        # One of the reasons we don't directly pass the command string to the backend with a `\n` join
        # is because the backend controller doesnt play nice when the command can be a multi-line string.
        # So we encode it to a base64 string and then decode it back to a command string at runtime to provide to
        # `bash -c`. The ideal thing to have done is to run "bash -c {shlex.quote(command_string)}" and call it a day
        # but the backend controller yields the following error:
        # `error parsing template: error converting YAML to JSON: yaml: line 111: mapping values are not allowed in this context`
        # So we go to great length to ensure the command is provided in base64 to avoid any issues with the backend controller.
        import base64

        encoded_command = base64.b64encode(command_string.encode()).decode()
        decode_cmd = f"echo {encoded_command} | base64 -d > ./_ob_app_run.sh"
        return (
            f"bash -c '{decode_cmd} && cat ./_ob_app_run.sh && bash ./_ob_app_run.sh'"
        )

    @classmethod
    def _marshal_environment_variables(cls, app_config: "AppConfig"):
        """Flatten configured env vars into [{"name": ..., "value": str}, ...]."""
        envs = app_config.get_state("environment", {}).copy()
        marshalled = []
        for key, value in envs.items():
            # Fix/idiom: dict and list branches both JSON-encoded, so merge
            # them; everything else is stringified.
            if isinstance(value, (dict, list)):
                serialized = json.dumps(value)
            else:
                serialized = str(value)
            marshalled.append(
                {
                    "name": key,
                    "value": serialized,
                }
            )
        return marshalled

    @classmethod
    def from_app_config(cls, app_config: "AppConfig"):
        """Translate an AppConfig into the capsule-create request body.

        Note: first parameter renamed `self` -> `cls` — this is a classmethod,
        so the first parameter is the class; callers are unaffected.
        """
        gpu_resource = app_config.get_state("resources").get("gpu")
        resources = {}
        shared_memory = app_config.get_state("resources").get("shared_memory")
        if gpu_resource:
            resources["gpu"] = gpu_resource
        if shared_memory:
            resources["sharedMemory"] = shared_memory

        _scheduling_config = {}
        if app_config.get_state("compute_pools", None):
            _scheduling_config["schedulingConfig"] = {
                "computePools": [
                    {"name": x} for x in app_config.get_state("compute_pools")
                ]
            }
        _description = app_config.get_state("description")
        _app_type = app_config.get_state("app_type")
        _final_info = {}
        if _description:
            _final_info["description"] = _description
        if _app_type:
            _final_info["endpointType"] = _app_type
        return {
            "perimeter": app_config.get_state("perimeter"),
            **_final_info,
            "codePackagePath": app_config.get_state("code_package_url"),
            "image": app_config.get_state("image"),
            "resourceIntegrations": [
                {"name": x} for x in app_config.get_state("secrets", [])
            ],
            "resourceConfig": {
                "cpu": str(app_config.get_state("resources").get("cpu")),
                "memory": str(app_config.get_state("resources").get("memory")),
                "ephemeralStorage": str(app_config.get_state("resources").get("disk")),
                **resources,
            },
            "autoscalingConfig": {
                "minReplicas": app_config.get_state("replicas", {}).get("min", 1),
                "maxReplicas": app_config.get_state("replicas", {}).get("max", 1),
            },
            **_scheduling_config,
            "containerStartupConfig": {
                "entrypoint": cls.construct_exec_command(
                    app_config.get_state("commands")
                )
            },
            "environmentVariables": cls._marshal_environment_variables(app_config),
            # "assets": [{"name": "startup-script.sh"}],
            "authConfig": {
                "authType": app_config.get_state("auth").get("type"),
                "publicToDeployment": app_config.get_state("auth").get("public"),
            },
            "tags": [
                dict(key=k, value=v)
                for tag in app_config.get_state("tags", [])
                for k, v in tag.items()
            ],
            "port": app_config.get_state("port"),
            "displayName": app_config.get_state("name"),
            "forceUpdate": app_config.get_state("force_upgrade", False),
        }
|
307
|
-
|
308
|
-
|
309
|
-
class CapsuleApiException(Exception):
    """Raised when a capsule API call fails (HTTP error, retry exhaustion,
    or an undecodable/unexpected response body)."""

    def __init__(
        self,
        url: str,
        method: str,
        status_code: int,
        text: str,
        message: Optional[str] = None,
    ):
        self.url = url
        self.method = method
        self.status_code = status_code
        self.text = text
        self.message = message

    def __str__(self):
        rendered = (
            f"CapsuleApiException: {self.url} [{self.method}]: "
            f"Status Code: {self.status_code} \n\n {self.text}"
        )
        if self.message:
            rendered += f"\n\n {self.message}"
        return rendered
|
329
|
-
|
330
|
-
|
331
|
-
class CapsuleDeploymentException(Exception):
    """Raised when a capsule deployment cannot reach a healthy terminal state
    (e.g. crashlooping workers or an out-of-band upgrade)."""

    def __init__(
        self,
        capsule_id: str,
        message: str,
    ):
        self.capsule_id = capsule_id
        self.message = message

    def __str__(self):
        return "CapsuleDeploymentException: [%s] :: %s" % (self.capsule_id, self.message)
|
342
|
-
|
343
|
-
|
344
|
-
class CapsuleApi:
    """HTTP client for the capsule endpoints under
    `<base_url>/v1/perimeters/<perimeter>/capsules`.

    Every request goes through `safe_requests_wrapper` (retries) and failures
    surface as `CapsuleApiException`.
    """

    def __init__(self, base_url: str, perimeter: str, logger_fn=None):
        self._base_url = self._create_base_url(base_url, perimeter)
        from metaflow.metaflow_config import SERVICE_HEADERS

        self._logger_fn = logger_fn
        self._request_headers = {
            **{"Content-Type": "application/json", "Connection": "keep-alive"},
            **(SERVICE_HEADERS or {}),
        }

    @staticmethod
    def _create_base_url(base_url: str, perimeter: str):
        """Join the capsule collection URL for `perimeter`."""
        return os.path.join(
            base_url,
            "v1",
            "perimeters",
            perimeter,
            "capsules",
        )

    def _wrapped_api_caller(self, method_func, *args, **kwargs):
        """Invoke `method_func` via safe_requests_wrapper, translating retry
        exhaustion and HTTP >= 400 responses into CapsuleApiException."""
        try:
            response = safe_requests_wrapper(
                method_func,
                *args,
                headers=self._request_headers,
                logger_fn=self._logger_fn,
                **kwargs,
            )
        except MaximumRetriesExceeded as e:
            raise CapsuleApiException(
                e.url,
                e.method,
                e.status_code,
                e.text,
                message=f"Maximum retries exceeded for {e.url} [{e.method}]",
            )
        if response.status_code >= 400:
            raise CapsuleApiException(
                args[0],
                method_func.__name__,
                response.status_code,
                response.text,
            )
        return response

    @staticmethod
    def _decoded_json(response, url: str, method: str):
        # Shared helper: the original repeated this try/except in every
        # method; behavior (exception type, message) is unchanged.
        try:
            return response.json()
        except json.JSONDecodeError:
            raise CapsuleApiException(
                url,
                method,
                response.status_code,
                response.text,
                message="Capsule JSON decode failed",
            )

    def create(self, capsule_input: dict):
        """POST a new capsule; returns the decoded response body."""
        _data = json.dumps(capsule_input)
        response = self._wrapped_api_caller(
            requests.post,
            self._base_url,
            data=_data,
        )
        return self._decoded_json(response, self._base_url, "post")

    def get(self, capsule_id: str):
        """GET a single capsule by id; returns the decoded response body."""
        _url = os.path.join(self._base_url, capsule_id)
        response = self._wrapped_api_caller(
            requests.get,
            _url,
            retryable_status_codes=[409, 404],  # todo : verify me
            conn_error_retries=3,
        )
        return self._decoded_json(response, _url, "get")

    def list(self):
        """GET all capsules in the perimeter; returns a (possibly empty) list."""
        response = self._wrapped_api_caller(
            requests.get,
            self._base_url,
            retryable_status_codes=[409],  # todo : verify me
            conn_error_retries=3,
        )
        response_json = self._decoded_json(response, self._base_url, "get")
        if "capsules" not in response_json:
            # Fix: this branch previously reported "Capsule JSON decode failed",
            # which was misleading — the JSON decoded fine but lacked the key.
            raise CapsuleApiException(
                self._base_url,
                "get",
                response.status_code,
                response.text,
                message="Capsule response is missing the 'capsules' key",
            )
        return response_json.get("capsules", []) or []

    def delete(self, capsule_id: str):
        """DELETE a capsule; True iff the backend returned HTTP 200."""
        _url = os.path.join(self._base_url, capsule_id)
        response = self._wrapped_api_caller(
            requests.delete,
            _url,
            retryable_status_codes=[409],  # todo : verify me
        )
        if response.status_code >= 400:
            raise CapsuleApiException(
                _url,
                "delete",
                response.status_code,
                response.text,
            )
        return response.status_code == 200

    def get_workers(self, capsule_id: str) -> List[Dict[str, Any]]:
        """GET the worker list for a capsule (empty list when none)."""
        _url = os.path.join(self._base_url, capsule_id, "workers")
        response = self._wrapped_api_caller(
            requests.get,
            _url,
            retryable_status_codes=[409, 404],  # todo : verify me
            # Adding 404s because sometimes we might even end up getting 404s if
            # the backend cache is not updated yet. So on consistent 404s we should
            # just crash out.
            conn_error_retries=3,
        )
        return self._decoded_json(response, _url, "get").get("workers", []) or []

    def logs(
        self, capsule_id: str, worker_id: str, previous: bool = False
    ) -> List[str]:
        """GET logs for one worker; `previous=True` requests the prior run's logs."""
        _url = os.path.join(self._base_url, capsule_id, "workers", worker_id, "logs")
        options = {"previous": True} if previous else None
        response = self._wrapped_api_caller(
            requests.get,
            _url,
            retryable_status_codes=[409],  # todo : verify me
            params=options,
        )
        return self._decoded_json(response, _url, "get").get("logs", []) or []
|
519
|
-
|
520
|
-
|
521
|
-
def list_and_filter_capsules(
    capsule_api: "CapsuleApi", project, branch, name, tags, auth_type, capsule_id
):
    """Return capsules from `capsule_api.list()` matching ALL provided filters.

    Any falsy filter argument is ignored. `tags` is a list of
    {"key": ..., "value": ...} dicts that must all be present on the capsule.
    """

    def _tags_match(capsule_tags, key, value):
        # True if any tag on the capsule has the given key/value pair.
        return any(t["key"] == key and t["value"] == value for t in capsule_tags)

    def _all_tags_match(capsule_tags, tags_to_match):
        # Idiom fix: generator instead of materialized list inside all().
        return all(_tags_match(capsule_tags, t["key"], t["value"]) for t in tags_to_match)

    # Idiom fix: the original wrapped this loop in an inner `_filter_capsules`
    # closure that was called exactly once with the enclosing arguments.
    _filtered_capsules = []
    for capsule in capsule_api.list():
        spec = capsule.get("spec", {})
        set_tags = spec.get("tags", [])
        display_name = spec.get("displayName", None)
        set_id = capsule.get("id", None)
        set_auth_type = spec.get("authConfig", {}).get("authType", None)

        if auth_type and set_auth_type != auth_type:
            continue
        if project and not _tags_match(set_tags, "project", project):
            continue
        if branch and not _tags_match(set_tags, "branch", branch):
            continue
        if name and display_name != name:
            continue
        if tags and not _all_tags_match(set_tags, tags):
            continue
        if capsule_id and set_id != capsule_id:
            continue

        _filtered_capsules.append(capsule)
    return _filtered_capsules
|
564
|
-
|
565
|
-
|
566
|
-
from collections import namedtuple

# Lightweight pairing of a capsule's API record (`info`) with the list of its
# worker statuses (`workers`).
CapsuleInfo = namedtuple("CapsuleInfo", ["info", "workers"])
|
569
|
-
|
570
|
-
|
571
|
-
class CapsuleDeployer:
    """Drives a capsule deployment end-to-end: builds the create payload,
    POSTs it, then polls capsule + worker status until a terminal state."""

    # Populated by wait_for_terminal_state(); last-known capsule state machine.
    status: CapsuleStateMachine

    # Capsule id returned by the backend `create` call.
    identifier = None

    # TODO: Current default timeout is very large of 5 minutes. Ideally we should have finished the deployed in less than 1 minutes.
    def __init__(
        self,
        app_config: AppConfig,
        base_url: str,
        create_timeout: int = 60 * 5,
        debug_dir: Optional[str] = None,
        success_terminal_state_condition: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
        readiness_wait_time: int = 20,
        logger_fn=None,
    ):
        self._app_config = app_config
        self._capsule_api = CapsuleApi(
            base_url,
            app_config.get_state("perimeter"),
            logger_fn=logger_fn or partial(print, file=sys.stderr),
        )
        self._create_timeout = create_timeout
        self._logger_fn = logger_fn
        self._debug_dir = debug_dir
        self._capsule_deploy_response = None
        self._success_terminal_state_condition = success_terminal_state_condition
        self._readiness_wait_time = readiness_wait_time

    @property
    def capsule_api(self):
        return self._capsule_api

    @property
    def capsule_type(self):
        """Human-facing kind of this capsule: "App" (browser auth) or "Endpoint" (API auth)."""
        auth_type = self._app_config.get_state("auth", {}).get("type", AuthType.default)
        if auth_type == AuthType.BROWSER:
            return "App"
        elif auth_type == AuthType.API:
            return "Endpoint"
        else:
            raise TODOException(f"Unknown auth type: {auth_type}")

    @property
    def name(self):
        return self._app_config.get_state("name")

    def create_input(self):
        """Build the create payload, allowing experimental overrides to mutate it."""
        return experimental.capsule_input_overrides(
            self._app_config, CapsuleInput.from_app_config(self._app_config)
        )

    @property
    def current_deployment_instance_version(self):
        """
        The backend `create` call returns a version of the object that will be
        deployed by this deployer instance; None until create() has run.
        """
        if self._capsule_deploy_response is None:
            return None
        return self._capsule_deploy_response.get("version", None)

    def create(self):
        """POST the capsule; stores and returns the backend-assigned identifier."""
        capsule_response = self._capsule_api.create(self.create_input())
        self.identifier = capsule_response.get("id")
        self._capsule_deploy_response = capsule_response
        return self.identifier

    def get(self):
        return self._capsule_api.get(self.identifier)

    def get_workers(self):
        return self._capsule_api.get_workers(self.identifier)

    def _backend_version_mismatch_check(
        self, capsule_response: dict, current_deployment_instance_version: str
    ):
        """
        - `capsule_response.version` contains the version of the object present in the database
        - `current_deployment_instance_version` contains the version of the object that was deployed by this instance of the deployer.
        In the situtation that the versions of the objects become a mismatch then it means that current deployment process is not giving the user the
        output that they desire.
        """
        if capsule_response.get("version", None) != current_deployment_instance_version:
            raise CapsuleDeploymentException(
                self.identifier,  # type: ignore
                f"A capsule upgrade was triggered outside current deployment instance. Current deployment version was discarded. Current deployment version: {current_deployment_instance_version} and new version: {capsule_response.get('version', None)}",
            )

    def _monitor_worker_readiness(
        self,
        workers_sm: "CapsuleWorkersStateMachine",
    ):
        """returns True if the worker is crashlooping, False otherwise"""
        logger = self._logger_fn or partial(print, file=sys.stderr)
        # Poll once per second for `readiness_wait_time` seconds; bail early
        # as soon as a crashloop is detected.
        for i in range(self._readiness_wait_time):
            time.sleep(1)
            workers_response = self.get_workers()
            workers_sm.add_status(workers_response)
            workers_sm.report_current_status(logger)
            if workers_sm.is_crashlooping:
                return True
        return False

    def _extract_logs_from_crashlooping_worker(
        self, workers_sm: "CapsuleWorkersStateMachine"
    ):
        """Return (logs, worker_id) for the first crashlooping worker, or (None, None)."""

        def _extract_worker_id_of_crashlooping_worker(
            workers_status: List[WorkerStatus],
        ):
            # First worker whose phase is CrashLoopBackOff, if any.
            for worker in workers_status:
                if worker["phase"] == "CrashLoopBackOff":
                    return worker["workerId"]
            return None

        worker_id = _extract_worker_id_of_crashlooping_worker(workers_sm.current_status)
        if worker_id is None:
            return None, None
        # previous=True: fetch the logs of the crashed (previous) container run.
        logs = self.capsule_api.logs(self.identifier, worker_id, previous=True)
        return logs, worker_id

    def wait_for_terminal_state(
        self,
    ):
        """Poll capsule and worker status until ready/failed/timeout.

        Returns a summary dict on success; raises CapsuleDeploymentException on
        crashlooping workers, out-of-band upgrades, or timeout.
        """
        logger = self._logger_fn or partial(print, file=sys.stderr)
        state_machine = CapsuleStateMachine(
            self.identifier, self.current_deployment_instance_version
        )
        # min_replicas will always be present
        min_replicas = self._app_config.get_state("replicas", {}).get("min")
        workers_state_machine = CapsuleWorkersStateMachine(
            self.identifier,
            self.current_deployment_instance_version,
            deployment_mode=self._success_terminal_state_condition,
            minimum_replicas=min_replicas,
        )
        self.status = state_machine
        # One-second polling loop, bounded by the create timeout.
        for i in range(self._create_timeout):
            time.sleep(1)
            capsule_response = self.get()
            workers_response = self.get_workers()

            # We first need to check if someone has not upgraded the capsule under the hood and
            # the current deployment instance is invalid.
            self._backend_version_mismatch_check(
                capsule_response, self.current_deployment_instance_version
            )
            state_machine.add_status(capsule_response.get("status", {}))
            workers_state_machine.add_status(workers_response)
            state_machine.report_current_status(logger)

            workers_state_machine.report_current_status(logger)
            # Deployment readiness checks will determine what is the terminal state
            # of the workerstate machine. If we detect a terminal state in the workers,
            # then even if the capsule upgrade is still in progress we will end up crashing
            # the deployment.
            (
                capsule_ready,
                further_check_worker_readiness,
            ) = DEPLOYMENT_READY_CONDITIONS.check_readiness_condition(
                state_machine.current_status,
                workers_state_machine.current_version_deployment_status(),
                self._success_terminal_state_condition,
            )

            failure_condition_satisfied = (
                DEPLOYMENT_READY_CONDITIONS.check_failure_condition(
                    state_machine.current_status,
                    workers_state_machine.current_version_deployment_status(),
                )
            )
            if capsule_ready or failure_condition_satisfied:
                logger(
                    "💊 %s deployment status: %s | worker states: [success :%s | failure :%s ] "
                    % (
                        self.capsule_type.title(),
                        "in progress"
                        if state_machine.update_in_progress
                        else "completed",
                        capsule_ready,
                        failure_condition_satisfied,
                    )
                )
                _further_readiness_check_failed = False
                if further_check_worker_readiness:
                    # HACK : monitor the workers for N seconds to make sure they are healthy
                    # this is a hack. Ideally we should implment a healtcheck as a first class citizen
                    # but it will take some time to do that so in the meanwhile a timeout set on the cli
                    # side will be really helpful.
                    logger(
                        "💊 running last minute readiness check for %s..."
                        % self.identifier
                    )
                    _further_readiness_check_failed = self._monitor_worker_readiness(
                        workers_state_machine
                    )

                if CAPSULE_DEBUG:
                    logger(
                        f"[debug] 💊 {self.capsule_type} {self.identifier}: further_check_worker_readiness {_further_readiness_check_failed} | failure_condition_satisfied {failure_condition_satisfied}"
                    )

                # We should still check for failure state and crash if we detect something in the readiness check
                if failure_condition_satisfied or _further_readiness_check_failed:
                    # hit the logs endpoint for the worker and get the logs
                    # Print those logs out on the terminal
                    # raise an exception that should be caught gracefully by the cli
                    logs, worker_id = self._extract_logs_from_crashlooping_worker(
                        workers_state_machine
                    )
                    if logs is not None:
                        # todo: It would be really odd if the logs are not present and we discover something is crashlooping.
                        # Handle that condition later
                        logger(
                            *(
                                [
                                    f"💥 Worker ID ({worker_id}) is crashlooping. Please check the following logs for more information: "
                                ]
                                + ["\t" + l["message"] for l in logs]
                            )
                        )
                    raise CapsuleDeploymentException(
                        self.identifier,
                        f"Worker ID ({worker_id}) is crashlooping. Please check the logs for more information.",
                    )

                if state_machine.ready_to_serve_traffic:
                    logger(
                        "💊 %s %s is ready to serve traffic on the URL: %s"
                        % (
                            self.capsule_type,
                            self.identifier,
                            state_machine.out_of_cluster_url,
                        ),
                    )

                break

            if CAPSULE_DEBUG and self._debug_dir:
                state_machine.save_debug_info(self._debug_dir)
                workers_state_machine.save_debug_info(self._debug_dir)
                if i % 3 == 0:  # Every 3 seconds report the status
                    logger(
                        f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
                    )

        if not self.status.ready_to_serve_traffic:
            # Covers both timeout and a break without a published URL.
            raise CapsuleDeploymentException(
                self.identifier,
                f"Capsule {self.identifier} failed to be ready to serve traffic",
            )

        if CAPSULE_DEBUG and self._debug_dir:
            state_machine.save_debug_info(self._debug_dir)
            workers_state_machine.save_debug_info(self._debug_dir)
            logger(
                f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status [on return]: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
            )

        return dict(
            id=self.identifier,
            auth_type=self.capsule_type,
            public_url=self.status.out_of_cluster_url,
            available_replicas=self.status.available_replicas,
            name=self.name,
            deployed_version=self.current_deployment_instance_version,
            deployed_at=datetime.now().isoformat(),
        )
|