outerbounds 0.3.182rc1__py3-none-any.whl → 0.3.183rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- outerbounds/apps/_state_machine.py +89 -0
- outerbounds/apps/app_cli.py +62 -22
- outerbounds/apps/capsule.py +104 -64
- outerbounds/apps/utils.py +10 -4
- {outerbounds-0.3.182rc1.dist-info → outerbounds-0.3.183rc0.dist-info}/METADATA +4 -4
- {outerbounds-0.3.182rc1.dist-info → outerbounds-0.3.183rc0.dist-info}/RECORD +8 -8
- {outerbounds-0.3.182rc1.dist-info → outerbounds-0.3.183rc0.dist-info}/WHEEL +0 -0
- {outerbounds-0.3.182rc1.dist-info → outerbounds-0.3.183rc0.dist-info}/entry_points.txt +0 -0
@@ -137,6 +137,19 @@ class _capsuleDeployerStateMachine:
|
|
137
137
|
from typing import TypedDict
|
138
138
|
|
139
139
|
|
140
|
+
class AccessInfo(TypedDict):
|
141
|
+
outOfClusterURL: str
|
142
|
+
inClusterURL: str
|
143
|
+
|
144
|
+
|
145
|
+
class CapsuleStatus(TypedDict):
|
146
|
+
availableReplicas: int
|
147
|
+
readyToServeTraffic: bool
|
148
|
+
accessInfo: AccessInfo
|
149
|
+
updateInProgress: bool
|
150
|
+
currentlyServedVersion: str
|
151
|
+
|
152
|
+
|
140
153
|
class WorkerStatus(TypedDict):
|
141
154
|
workerId: str
|
142
155
|
phase: str
|
@@ -196,6 +209,82 @@ class DEPLOYMENT_READY_CONDITIONS:
|
|
196
209
|
# `ASYNC` implies that the deployment will be assumed ready after the URL is minted and the worker statuses are not checked.
|
197
210
|
ASYNC = "async"
|
198
211
|
|
212
|
+
@classmethod
|
213
|
+
def check_failure_condition(
|
214
|
+
cls,
|
215
|
+
capsule_status: CapsuleStatus,
|
216
|
+
worker_semantic_status: "CapsuleWorkerSemanticStatus",
|
217
|
+
) -> bool:
|
218
|
+
"""
|
219
|
+
Check if the deployment has failed based on the current capsule and worker status.
|
220
|
+
"""
|
221
|
+
return worker_semantic_status["status"]["at_least_one_crashlooping"]
|
222
|
+
|
223
|
+
@classmethod
|
224
|
+
def check_readiness_condition(
|
225
|
+
cls,
|
226
|
+
capsule_status: CapsuleStatus,
|
227
|
+
worker_semantic_status: "CapsuleWorkerSemanticStatus",
|
228
|
+
readiness_condition: str,
|
229
|
+
) -> Tuple[bool, bool]:
|
230
|
+
"""
|
231
|
+
Check if the deployment readiness condition is satisfied based on current capsule and worker status.
|
232
|
+
|
233
|
+
This method evaluates whether a deployment has reached its desired ready state according to
|
234
|
+
the specified readiness condition. Different conditions have different criteria for what
|
235
|
+
constitutes a "ready" deployment.
|
236
|
+
|
237
|
+
Parameters
|
238
|
+
----------
|
239
|
+
capsule_status : CapsuleStatus
|
240
|
+
The current status of the capsule deployment, including update progress information.
|
241
|
+
worker_semantic_status : CapsuleWorkerSemanticStatus
|
242
|
+
Semantic status information about the workers, including counts and states.
|
243
|
+
readiness_condition : str
|
244
|
+
The readiness condition to evaluate. Must be one of the class constants:
|
245
|
+
- ATLEAST_ONE_RUNNING: At least one worker is running and update is not in progress
|
246
|
+
- ALL_RUNNING: All required workers are running and update is not in progress
|
247
|
+
- FULLY_FINISHED: All workers running with no pending/crashlooping workers and update is not in progress
|
248
|
+
- ASYNC: Deployment ready when update is no longer in progress
|
249
|
+
|
250
|
+
Returns
|
251
|
+
-------
|
252
|
+
Tuple[bool, bool]
|
253
|
+
A tuple containing:
|
254
|
+
- First element: Boolean indicating if the readiness condition is satisfied
|
255
|
+
- Second element: Boolean indicating if additional worker readiness checks
|
256
|
+
should be performed (False for ASYNC mode, True for all others)
|
257
|
+
|
258
|
+
Raises
|
259
|
+
------
|
260
|
+
ValueError
|
261
|
+
If an invalid readiness condition is provided.
|
262
|
+
"""
|
263
|
+
_worker_readiness_check = True
|
264
|
+
_readiness_condition_satisfied = False
|
265
|
+
if readiness_condition == cls.ATLEAST_ONE_RUNNING:
|
266
|
+
_readiness_condition_satisfied = (
|
267
|
+
worker_semantic_status["status"]["at_least_one_running"]
|
268
|
+
and not capsule_status["updateInProgress"]
|
269
|
+
)
|
270
|
+
elif readiness_condition == cls.ALL_RUNNING:
|
271
|
+
_readiness_condition_satisfied = (
|
272
|
+
worker_semantic_status["status"]["all_running"]
|
273
|
+
and not capsule_status["updateInProgress"]
|
274
|
+
)
|
275
|
+
elif readiness_condition == cls.FULLY_FINISHED:
|
276
|
+
_readiness_condition_satisfied = (
|
277
|
+
worker_semantic_status["status"]["fully_finished"]
|
278
|
+
and not capsule_status["updateInProgress"]
|
279
|
+
)
|
280
|
+
elif readiness_condition == cls.ASYNC:
|
281
|
+
_readiness_condition_satisfied = not capsule_status["updateInProgress"]
|
282
|
+
_worker_readiness_check = False
|
283
|
+
else:
|
284
|
+
raise ValueError(f"Invalid readiness condition: {readiness_condition}")
|
285
|
+
|
286
|
+
return _readiness_condition_satisfied, _worker_readiness_check
|
287
|
+
|
199
288
|
@classmethod
|
200
289
|
def docstring(cls):
|
201
290
|
return cls.__doc__
|
outerbounds/apps/app_cli.py
CHANGED
@@ -223,6 +223,7 @@ class ColorTheme:
|
|
223
223
|
LOADING_COLOR = "cyan"
|
224
224
|
BAD_COLOR = "red"
|
225
225
|
INFO_COLOR = "green"
|
226
|
+
DEBUG_COLOR = "yellow"
|
226
227
|
|
227
228
|
TL_HEADER_COLOR = "magenta"
|
228
229
|
ROW_COLOR = "bright_white"
|
@@ -439,6 +440,12 @@ def deployment_instance_options(func):
|
|
439
440
|
help=DEPLOYMENT_READY_CONDITIONS.__doc__,
|
440
441
|
default=DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
|
441
442
|
)
|
443
|
+
@click.option(
|
444
|
+
"--status-file",
|
445
|
+
type=str,
|
446
|
+
help="The path to the file where the final status of the deployment will be written.",
|
447
|
+
default=None,
|
448
|
+
)
|
442
449
|
@click.option(
|
443
450
|
"--readiness-wait-time",
|
444
451
|
type=int,
|
@@ -446,9 +453,10 @@ def deployment_instance_options(func):
|
|
446
453
|
default=4,
|
447
454
|
)
|
448
455
|
@click.option(
|
449
|
-
"--
|
456
|
+
"--deployment-timeout",
|
457
|
+
"max_wait_time",
|
450
458
|
type=int,
|
451
|
-
help="The maximum time (in seconds) to wait for the deployment to
|
459
|
+
help="The maximum time (in seconds) to wait for the deployment to reach readiness before timing out.",
|
452
460
|
default=600,
|
453
461
|
)
|
454
462
|
@click.option(
|
@@ -702,6 +710,7 @@ def deploy(
|
|
702
710
|
readiness_condition=None,
|
703
711
|
max_wait_time=None,
|
704
712
|
readiness_wait_time=None,
|
713
|
+
status_file=None,
|
705
714
|
no_loader=False,
|
706
715
|
**options,
|
707
716
|
):
|
@@ -801,9 +810,9 @@ def deploy(
|
|
801
810
|
ctx.obj.app_state_dir, app_config.get("name", "default")
|
802
811
|
)
|
803
812
|
|
804
|
-
def _non_spinner_logger(*msg):
|
813
|
+
def _non_spinner_logger(*msg, **kwargs):
|
805
814
|
for m in msg:
|
806
|
-
logger(m)
|
815
|
+
logger(m, **kwargs)
|
807
816
|
|
808
817
|
deploy_validations(
|
809
818
|
app_config,
|
@@ -839,14 +848,34 @@ def deploy(
|
|
839
848
|
|
840
849
|
app_config.set_state("perimeter", ctx.obj.perimeter)
|
841
850
|
|
851
|
+
capsule_spinner = None
|
852
|
+
capsule_logger = _non_spinner_logger
|
853
|
+
if not no_loader:
|
854
|
+
capsule_spinner = MultiStepSpinner(
|
855
|
+
text=lambda: _logger_styled(
|
856
|
+
"💊 Waiting for %s %s to be ready to serve traffic"
|
857
|
+
% (capsule.capsule_type.lower(), capsule.identifier),
|
858
|
+
timestamp=True,
|
859
|
+
),
|
860
|
+
color=ColorTheme.LOADING_COLOR,
|
861
|
+
)
|
862
|
+
capsule_logger = partial(_spinner_logger, capsule_spinner)
|
863
|
+
capsule_spinner.start()
|
864
|
+
|
865
|
+
_current_instance_debug_dir = os.path.join(
|
866
|
+
cache_dir, f"debug_deployment_instance_{time.time()}"
|
867
|
+
)
|
868
|
+
if CAPSULE_DEBUG:
|
869
|
+
os.makedirs(_current_instance_debug_dir, exist_ok=True)
|
842
870
|
# 2. Convert to the IR that the backend accepts
|
843
871
|
capsule = CapsuleDeployer(
|
844
872
|
app_config,
|
845
873
|
ctx.obj.api_url,
|
846
|
-
debug_dir=
|
874
|
+
debug_dir=_current_instance_debug_dir,
|
847
875
|
success_terminal_state_condition=readiness_condition,
|
848
876
|
create_timeout=max_wait_time,
|
849
877
|
readiness_wait_time=readiness_wait_time,
|
878
|
+
logger_fn=capsule_logger,
|
850
879
|
)
|
851
880
|
currently_present_capsules = list_and_filter_capsules(
|
852
881
|
capsule.capsule_api,
|
@@ -879,13 +908,13 @@ def deploy(
|
|
879
908
|
"If you wish to force upgrade, you can do so by providing the `--force-upgrade` flag."
|
880
909
|
)
|
881
910
|
raise AppConfigError(message)
|
882
|
-
|
911
|
+
capsule_logger(
|
883
912
|
f"🚀 {'' if not force_upgrade else 'Force'} Upgrading {capsule.capsule_type.lower()} `{capsule.name}`....",
|
884
913
|
color=ColorTheme.INFO_COLOR,
|
885
914
|
system_msg=True,
|
886
915
|
)
|
887
916
|
else:
|
888
|
-
|
917
|
+
capsule_logger(
|
889
918
|
f"🚀 Deploying {capsule.capsule_type.lower()} to the platform....",
|
890
919
|
color=ColorTheme.INFO_COLOR,
|
891
920
|
system_msg=True,
|
@@ -894,21 +923,9 @@ def deploy(
|
|
894
923
|
capsule.create()
|
895
924
|
_post_create_debug(capsule, cache_dir)
|
896
925
|
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
capsule_spinner = MultiStepSpinner(
|
901
|
-
text=lambda: _logger_styled(
|
902
|
-
"💊 Waiting for %s %s to be ready to serve traffic"
|
903
|
-
% (capsule.capsule_type.lower(), capsule.identifier),
|
904
|
-
timestamp=True,
|
905
|
-
),
|
906
|
-
color=ColorTheme.LOADING_COLOR,
|
907
|
-
)
|
908
|
-
capsule_logger = partial(_spinner_logger, capsule_spinner)
|
909
|
-
capsule_spinner.start()
|
910
|
-
|
911
|
-
capsule.wait_for_terminal_state(logger=capsule_logger)
|
926
|
+
# We only get the `capsule_response` if the deployment has reached
|
927
|
+
# a successful terminal state.
|
928
|
+
final_status = capsule.wait_for_terminal_state()
|
912
929
|
if capsule_spinner:
|
913
930
|
capsule_spinner.stop()
|
914
931
|
|
@@ -918,12 +935,35 @@ def deploy(
|
|
918
935
|
system_msg=True,
|
919
936
|
)
|
920
937
|
|
938
|
+
if CAPSULE_DEBUG:
|
939
|
+
logger(
|
940
|
+
f"[debug] 💊 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) deployment status [on completion]: {final_status}",
|
941
|
+
color=ColorTheme.DEBUG_COLOR,
|
942
|
+
)
|
943
|
+
logger(
|
944
|
+
f"[debug] 💊 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) debug info saved to `{_current_instance_debug_dir}`",
|
945
|
+
color=ColorTheme.DEBUG_COLOR,
|
946
|
+
)
|
947
|
+
final_status["debug_dir"] = _current_instance_debug_dir
|
948
|
+
|
949
|
+
if status_file:
|
950
|
+
# Create the file if it doesn't exist
|
951
|
+
with open(status_file, "w") as f:
|
952
|
+
f.write(json.dumps(final_status, indent=4))
|
953
|
+
logger(
|
954
|
+
f"📝 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) deployment status written to {status_file}",
|
955
|
+
color=ColorTheme.INFO_COLOR,
|
956
|
+
system_msg=True,
|
957
|
+
)
|
958
|
+
|
921
959
|
except Exception as e:
|
922
960
|
logger(
|
923
961
|
f"Deployment failed: [{e.__class__.__name__}]: {e}",
|
924
962
|
bad=True,
|
925
963
|
system_msg=True,
|
926
964
|
)
|
965
|
+
if CAPSULE_DEBUG:
|
966
|
+
raise e
|
927
967
|
exit(1)
|
928
968
|
|
929
969
|
|
outerbounds/apps/capsule.py
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
+
from datetime import datetime
|
1
2
|
import json
|
2
3
|
import os
|
3
4
|
import pathlib
|
4
5
|
import requests
|
6
|
+
import sys
|
5
7
|
import time
|
8
|
+
from functools import partial
|
6
9
|
import shlex
|
7
10
|
from typing import Optional, List, Dict, Any, Tuple, Union
|
8
11
|
from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
|
@@ -13,6 +16,7 @@ from ._state_machine import (
|
|
13
16
|
_capsule_worker_status_diff,
|
14
17
|
CapsuleWorkerSemanticStatus,
|
15
18
|
WorkerStatus,
|
19
|
+
CapsuleStatus,
|
16
20
|
DEPLOYMENT_READY_CONDITIONS,
|
17
21
|
)
|
18
22
|
|
@@ -59,7 +63,7 @@ class CapsuleStateMachine:
|
|
59
63
|
def get_status_trail(self):
|
60
64
|
return self._status_trail
|
61
65
|
|
62
|
-
def add_status(self, status:
|
66
|
+
def add_status(self, status: CapsuleStatus):
|
63
67
|
assert type(status) == dict, "TODO: Make this check somewhere else"
|
64
68
|
self._status_trail.append({"timestamp": time.time(), "status": status})
|
65
69
|
|
@@ -106,13 +110,10 @@ class CapsuleStateMachine:
|
|
106
110
|
def report_current_status(self, logger):
|
107
111
|
pass
|
108
112
|
|
109
|
-
def
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
)
|
114
|
-
with open(debug_path, "w") as f:
|
115
|
-
json.dump(self._status_trail, f, indent=4)
|
113
|
+
def save_debug_info(self, state_dir: str):
|
114
|
+
debug_path = os.path.join(state_dir, f"debug_capsule_{self._capsule_id}.json")
|
115
|
+
with open(debug_path, "w") as f:
|
116
|
+
json.dump(self._status_trail, f, indent=4)
|
116
117
|
|
117
118
|
|
118
119
|
class CapsuleWorkersStateMachine:
|
@@ -156,19 +157,18 @@ class CapsuleWorkersStateMachine:
|
|
156
157
|
{"timestamp": time.time(), "status": worker_list_response}
|
157
158
|
)
|
158
159
|
|
159
|
-
def
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
json.dump(self._status_trail, f, indent=4)
|
160
|
+
def save_debug_info(self, state_dir: str):
|
161
|
+
debug_path = os.path.join(
|
162
|
+
state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
|
163
|
+
)
|
164
|
+
with open(debug_path, "w") as f:
|
165
|
+
json.dump(self._status_trail, f, indent=4)
|
166
166
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
167
|
+
status_path = os.path.join(
|
168
|
+
state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
|
169
|
+
)
|
170
|
+
with open(status_path, "w") as f:
|
171
|
+
json.dump(self.current_version_deployment_status(), f, indent=4)
|
172
172
|
|
173
173
|
def report_current_status(self, logger):
|
174
174
|
if len(self._status_trail) == 0:
|
@@ -195,29 +195,7 @@ class CapsuleWorkersStateMachine:
|
|
195
195
|
)
|
196
196
|
|
197
197
|
@property
|
198
|
-
def
|
199
|
-
return any(
|
200
|
-
s is not None for s in [self.is_success_state, self.is_failure_state]
|
201
|
-
)
|
202
|
-
|
203
|
-
@property
|
204
|
-
def is_success_state(self):
|
205
|
-
status = self.current_version_deployment_status()
|
206
|
-
if self._deployment_mode == DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING:
|
207
|
-
return status["status"]["at_least_one_running"]
|
208
|
-
elif self._deployment_mode == DEPLOYMENT_READY_CONDITIONS.ALL_RUNNING:
|
209
|
-
return status["status"]["all_running"]
|
210
|
-
elif self._deployment_mode == DEPLOYMENT_READY_CONDITIONS.FULLY_FINISHED:
|
211
|
-
return (
|
212
|
-
status["status"]["current_info"]["running"] == self._minimum_replicas
|
213
|
-
and status["status"]["current_info"]["pending"] == 0
|
214
|
-
and status["status"]["current_info"]["crashlooping"] == 0
|
215
|
-
)
|
216
|
-
else:
|
217
|
-
raise ValueError(f"Unknown deployment mode: {self._deployment_mode}")
|
218
|
-
|
219
|
-
@property
|
220
|
-
def is_failure_state(self):
|
198
|
+
def is_crashlooping(self) -> bool:
|
221
199
|
status = self.current_version_deployment_status()
|
222
200
|
return status["status"]["at_least_one_crashlooping"]
|
223
201
|
|
@@ -351,19 +329,24 @@ class CapsuleApiException(Exception):
|
|
351
329
|
|
352
330
|
|
353
331
|
class CapsuleDeploymentException(Exception):
|
354
|
-
def __init__(
|
332
|
+
def __init__(
|
333
|
+
self,
|
334
|
+
capsule_id: str,
|
335
|
+
message: str,
|
336
|
+
):
|
355
337
|
self.capsule_id = capsule_id
|
356
338
|
self.message = message
|
357
339
|
|
358
340
|
def __str__(self):
|
359
|
-
return f"CapsuleDeploymentException: {self.capsule_id}
|
341
|
+
return f"CapsuleDeploymentException: [{self.capsule_id}] :: {self.message}"
|
360
342
|
|
361
343
|
|
362
344
|
class CapsuleApi:
|
363
|
-
def __init__(self, base_url: str, perimeter: str):
|
345
|
+
def __init__(self, base_url: str, perimeter: str, logger_fn=None):
|
364
346
|
self._base_url = self._create_base_url(base_url, perimeter)
|
365
347
|
from metaflow.metaflow_config import SERVICE_HEADERS
|
366
348
|
|
349
|
+
self._logger_fn = logger_fn
|
367
350
|
self._request_headers = {
|
368
351
|
**{"Content-Type": "application/json", "Connection": "keep-alive"},
|
369
352
|
**(SERVICE_HEADERS or {}),
|
@@ -385,6 +368,7 @@ class CapsuleApi:
|
|
385
368
|
method_func,
|
386
369
|
*args,
|
387
370
|
headers=self._request_headers,
|
371
|
+
logger_fn=self._logger_fn,
|
388
372
|
**kwargs,
|
389
373
|
)
|
390
374
|
except MaximumRetriesExceeded as e:
|
@@ -599,10 +583,16 @@ class CapsuleDeployer:
|
|
599
583
|
debug_dir: Optional[str] = None,
|
600
584
|
success_terminal_state_condition: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
|
601
585
|
readiness_wait_time: int = 20,
|
586
|
+
logger_fn=None,
|
602
587
|
):
|
603
588
|
self._app_config = app_config
|
604
|
-
self._capsule_api = CapsuleApi(
|
589
|
+
self._capsule_api = CapsuleApi(
|
590
|
+
base_url,
|
591
|
+
app_config.get_state("perimeter"),
|
592
|
+
logger_fn=logger_fn or partial(print, file=sys.stderr),
|
593
|
+
)
|
605
594
|
self._create_timeout = create_timeout
|
595
|
+
self._logger_fn = logger_fn
|
606
596
|
self._debug_dir = debug_dir
|
607
597
|
self._capsule_deploy_response = None
|
608
598
|
self._success_terminal_state_condition = success_terminal_state_condition
|
@@ -668,14 +658,19 @@ class CapsuleDeployer:
|
|
668
658
|
)
|
669
659
|
|
670
660
|
def _monitor_worker_readiness(
|
671
|
-
self,
|
661
|
+
self,
|
662
|
+
workers_sm: "CapsuleWorkersStateMachine",
|
672
663
|
):
|
673
|
-
""" """
|
664
|
+
"""returns True if the worker is crashlooping, False otherwise"""
|
665
|
+
logger = self._logger_fn or partial(print, file=sys.stderr)
|
674
666
|
for i in range(self._readiness_wait_time):
|
675
667
|
time.sleep(1)
|
676
668
|
workers_response = self.get_workers()
|
677
669
|
workers_sm.add_status(workers_response)
|
678
670
|
workers_sm.report_current_status(logger)
|
671
|
+
if workers_sm.is_crashlooping:
|
672
|
+
return True
|
673
|
+
return False
|
679
674
|
|
680
675
|
def _extract_logs_from_crashlooping_worker(
|
681
676
|
self, workers_sm: "CapsuleWorkersStateMachine"
|
@@ -696,13 +691,14 @@ class CapsuleDeployer:
|
|
696
691
|
|
697
692
|
def wait_for_terminal_state(
|
698
693
|
self,
|
699
|
-
logger=print,
|
700
694
|
):
|
701
695
|
""" """
|
696
|
+
logger = self._logger_fn or partial(print, file=sys.stderr)
|
702
697
|
state_machine = CapsuleStateMachine(
|
703
698
|
self.identifier, self.current_deployment_instance_version
|
704
699
|
)
|
705
|
-
min_replicas
|
700
|
+
# min_replicas will always be present
|
701
|
+
min_replicas = self._app_config.get_state("replicas", {}).get("min")
|
706
702
|
workers_state_machine = CapsuleWorkersStateMachine(
|
707
703
|
self.identifier,
|
708
704
|
self.current_deployment_instance_version,
|
@@ -729,10 +725,22 @@ class CapsuleDeployer:
|
|
729
725
|
# of the workerstate machine. If we detect a terminal state in the workers,
|
730
726
|
# then even if the capsule upgrade is still in progress we will end up crashing
|
731
727
|
# the deployment.
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
)
|
728
|
+
(
|
729
|
+
capsule_ready,
|
730
|
+
further_check_worker_readiness,
|
731
|
+
) = DEPLOYMENT_READY_CONDITIONS.check_readiness_condition(
|
732
|
+
state_machine.current_status,
|
733
|
+
workers_state_machine.current_version_deployment_status(),
|
734
|
+
self._success_terminal_state_condition,
|
735
|
+
)
|
736
|
+
|
737
|
+
failure_condition_satisfied = (
|
738
|
+
DEPLOYMENT_READY_CONDITIONS.check_failure_condition(
|
739
|
+
state_machine.current_status,
|
740
|
+
workers_state_machine.current_version_deployment_status(),
|
741
|
+
)
|
742
|
+
)
|
743
|
+
if capsule_ready or failure_condition_satisfied:
|
736
744
|
logger(
|
737
745
|
"💊 %s deployment status: %s | worker states: [success :%s | failure :%s ] "
|
738
746
|
% (
|
@@ -740,19 +748,31 @@ class CapsuleDeployer:
|
|
740
748
|
"in progress"
|
741
749
|
if state_machine.update_in_progress
|
742
750
|
else "completed",
|
743
|
-
|
744
|
-
|
751
|
+
capsule_ready,
|
752
|
+
failure_condition_satisfied,
|
745
753
|
)
|
746
754
|
)
|
747
|
-
|
755
|
+
_further_readiness_check_failed = False
|
756
|
+
if further_check_worker_readiness:
|
748
757
|
# HACK : monitor the workers for N seconds to make sure they are healthy
|
749
758
|
# this is a hack. Ideally we should implement a healthcheck as a first class citizen
|
750
759
|
# but it will take some time to do that so in the meanwhile a timeout set on the cli
|
751
760
|
# side will be really helpful.
|
752
|
-
|
761
|
+
logger(
|
762
|
+
"💊 running last minute readiness check for %s..."
|
763
|
+
% self.identifier
|
764
|
+
)
|
765
|
+
_further_readiness_check_failed = self._monitor_worker_readiness(
|
766
|
+
workers_state_machine
|
767
|
+
)
|
768
|
+
|
769
|
+
if CAPSULE_DEBUG:
|
770
|
+
logger(
|
771
|
+
f"[debug] 💊 {self.capsule_type} {self.identifier}: further_check_worker_readiness {_further_readiness_check_failed} | failure_condition_satisfied {failure_condition_satisfied}"
|
772
|
+
)
|
753
773
|
|
754
774
|
# We should still check for failure state and crash if we detect something in the readiness check
|
755
|
-
if
|
775
|
+
if failure_condition_satisfied or _further_readiness_check_failed:
|
756
776
|
# hit the logs endpoint for the worker and get the logs
|
757
777
|
# Print those logs out on the terminal
|
758
778
|
# raise an exception that should be caught gracefully by the cli
|
@@ -787,13 +807,33 @@ class CapsuleDeployer:
|
|
787
807
|
|
788
808
|
break
|
789
809
|
|
790
|
-
if self._debug_dir:
|
791
|
-
state_machine.
|
792
|
-
workers_state_machine.
|
810
|
+
if CAPSULE_DEBUG and self._debug_dir:
|
811
|
+
state_machine.save_debug_info(self._debug_dir)
|
812
|
+
workers_state_machine.save_debug_info(self._debug_dir)
|
813
|
+
if i % 3 == 0: # Every 3 seconds report the status
|
814
|
+
logger(
|
815
|
+
f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
|
816
|
+
)
|
793
817
|
|
794
818
|
if not self.status.ready_to_serve_traffic:
|
795
819
|
raise CapsuleDeploymentException(
|
796
820
|
self.identifier,
|
797
821
|
f"Capsule {self.identifier} failed to be ready to serve traffic",
|
798
822
|
)
|
799
|
-
|
823
|
+
|
824
|
+
if CAPSULE_DEBUG and self._debug_dir:
|
825
|
+
state_machine.save_debug_info(self._debug_dir)
|
826
|
+
workers_state_machine.save_debug_info(self._debug_dir)
|
827
|
+
logger(
|
828
|
+
f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status [on return]: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
|
829
|
+
)
|
830
|
+
|
831
|
+
return dict(
|
832
|
+
id=self.identifier,
|
833
|
+
auth_type=self.capsule_type,
|
834
|
+
public_url=self.status.out_of_cluster_url,
|
835
|
+
available_replicas=self.status.available_replicas,
|
836
|
+
name=self.name,
|
837
|
+
deployed_version=self.current_deployment_instance_version,
|
838
|
+
deployed_at=datetime.now().isoformat(),
|
839
|
+
)
|
outerbounds/apps/utils.py
CHANGED
@@ -176,6 +176,7 @@ def safe_requests_wrapper(
|
|
176
176
|
*args,
|
177
177
|
conn_error_retries=2,
|
178
178
|
retryable_status_codes=[409],
|
179
|
+
logger_fn=None,
|
179
180
|
**kwargs,
|
180
181
|
):
|
181
182
|
"""
|
@@ -206,10 +207,15 @@ def safe_requests_wrapper(
|
|
206
207
|
if response.status_code not in retryable_status_codes:
|
207
208
|
return response
|
208
209
|
if CAPSULE_DEBUG:
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
210
|
+
if logger_fn:
|
211
|
+
logger_fn(
|
212
|
+
f"[outerbounds-debug] safe_requests_wrapper: {response.url}[{requests_module_fn.__name__}] {response.status_code} {response.text}",
|
213
|
+
)
|
214
|
+
else:
|
215
|
+
print(
|
216
|
+
f"[outerbounds-debug] safe_requests_wrapper: {response.url}[{requests_module_fn.__name__}] {response.status_code} {response.text}",
|
217
|
+
file=sys.stderr,
|
218
|
+
)
|
213
219
|
_num_retries += 1
|
214
220
|
time.sleep((2 ** (_num_retries + 1)) + noise)
|
215
221
|
except requests.exceptions.ConnectionError:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: outerbounds
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.183rc0
|
4
4
|
Summary: More Data Science, Less Administration
|
5
5
|
License: Proprietary
|
6
6
|
Keywords: data science,machine learning,MLOps
|
@@ -28,9 +28,9 @@ Requires-Dist: google-auth (>=2.27.0,<3.0.0) ; extra == "gcp"
|
|
28
28
|
Requires-Dist: google-cloud-secret-manager (>=2.20.0,<3.0.0) ; extra == "gcp"
|
29
29
|
Requires-Dist: google-cloud-storage (>=2.14.0,<3.0.0) ; extra == "gcp"
|
30
30
|
Requires-Dist: metaflow-checkpoint (==0.2.1)
|
31
|
-
Requires-Dist: ob-metaflow (==2.15.
|
32
|
-
Requires-Dist: ob-metaflow-extensions (==1.1.
|
33
|
-
Requires-Dist: ob-metaflow-stubs (==6.0.3.
|
31
|
+
Requires-Dist: ob-metaflow (==2.15.18.1)
|
32
|
+
Requires-Dist: ob-metaflow-extensions (==1.1.171rc0)
|
33
|
+
Requires-Dist: ob-metaflow-stubs (==6.0.3.183rc0)
|
34
34
|
Requires-Dist: opentelemetry-distro (>=0.41b0) ; extra == "otel"
|
35
35
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http (>=1.20.0) ; extra == "otel"
|
36
36
|
Requires-Dist: opentelemetry-instrumentation-requests (>=0.41b0) ; extra == "otel"
|
@@ -43,11 +43,11 @@ outerbounds/_vendor/yaml/scanner.py,sha256=ZcI8IngR56PaQ0m27WU2vxCqmDCuRjz-hr7pi
|
|
43
43
|
outerbounds/_vendor/yaml/serializer.py,sha256=8wFZRy9SsQSktF_f9OOroroqsh4qVUe53ry07P9UgCc,4368
|
44
44
|
outerbounds/_vendor/yaml/tokens.py,sha256=JBSu38wihGr4l73JwbfMA7Ks1-X84g8-NskTz7KwPmA,2578
|
45
45
|
outerbounds/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
|
-
outerbounds/apps/_state_machine.py,sha256=
|
47
|
-
outerbounds/apps/app_cli.py,sha256=
|
46
|
+
outerbounds/apps/_state_machine.py,sha256=PaegyxSxNZxyLTxU9_kekd3MPM9sW76RZPkibeMTMfY,18314
|
47
|
+
outerbounds/apps/app_cli.py,sha256=n_NABDjdgY4ApgNvdQMmpJfPGzCwJxr_G0w6-5LZ85I,51940
|
48
48
|
outerbounds/apps/app_config.py,sha256=UHVK8JLIuW-OcGg5WxDm4QHeImPGtohD4KpJryZntC4,11307
|
49
49
|
outerbounds/apps/artifacts.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
|
-
outerbounds/apps/capsule.py,sha256=
|
50
|
+
outerbounds/apps/capsule.py,sha256=NC9ajD06y6U-COi-8Qw6k_N1ltbQAio2O_Xs2RTrAVA,32857
|
51
51
|
outerbounds/apps/cli_to_config.py,sha256=Thc5jXRxoU6Pr8kAVVOX-5Es5ha6y6Vh_GBzL__oI7Q,3299
|
52
52
|
outerbounds/apps/click_importer.py,sha256=nnkPOR6TKrtIpc3a5Fna1zVJoQqDZvUXlNA9CdiNKFc,995
|
53
53
|
outerbounds/apps/code_package/__init__.py,sha256=8McF7pgx8ghvjRnazp2Qktlxi9yYwNiwESSQrk-2oW8,68
|
@@ -59,7 +59,7 @@ outerbounds/apps/deployer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
59
59
|
outerbounds/apps/experimental/__init__.py,sha256=RUZBAyqFnX3pRQxTjNmS1-qpgQcc9xQGQD2yJh4MA_M,3349
|
60
60
|
outerbounds/apps/perimeters.py,sha256=1J1_-5legFPskv3HTRwQMpzTytE3TO8KRT2IvVOrWcQ,1584
|
61
61
|
outerbounds/apps/secrets.py,sha256=aWzcAayQEJghQgFP_qp9w6jyvan_hoL4_ceqZ0ZjLd4,6126
|
62
|
-
outerbounds/apps/utils.py,sha256=
|
62
|
+
outerbounds/apps/utils.py,sha256=C-4GLU5GHwwWHbW962Qac-wecvtdiBXezq0c8i9aJvs,7908
|
63
63
|
outerbounds/apps/validations.py,sha256=kR2eXckx0XJ4kUOOLkMRepbTh0INtL1Z8aV4-fZpfc8,678
|
64
64
|
outerbounds/cli_main.py,sha256=e9UMnPysmc7gbrimq2I4KfltggyU7pw59Cn9aEguVcU,74
|
65
65
|
outerbounds/command_groups/__init__.py,sha256=QPWtj5wDRTINDxVUL7XPqG3HoxHNvYOg08EnuSZB2Hc,21
|
@@ -78,7 +78,7 @@ outerbounds/utils/metaflowconfig.py,sha256=l2vJbgPkLISU-XPGZFaC8ZKmYFyJemlD6bwB-
|
|
78
78
|
outerbounds/utils/schema.py,sha256=lMUr9kNgn9wy-sO_t_Tlxmbt63yLeN4b0xQXbDUDj4A,2331
|
79
79
|
outerbounds/utils/utils.py,sha256=4Z8cszNob_8kDYCLNTrP-wWads_S_MdL3Uj3ju4mEsk,501
|
80
80
|
outerbounds/vendor.py,sha256=gRLRJNXtZBeUpPEog0LOeIsl6GosaFFbCxUvR4bW6IQ,5093
|
81
|
-
outerbounds-0.3.
|
82
|
-
outerbounds-0.3.
|
83
|
-
outerbounds-0.3.
|
84
|
-
outerbounds-0.3.
|
81
|
+
outerbounds-0.3.183rc0.dist-info/METADATA,sha256=wY0FgePmu807U8X1eBnglevz656zQgPuAjooGlbNpMU,1846
|
82
|
+
outerbounds-0.3.183rc0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
83
|
+
outerbounds-0.3.183rc0.dist-info/entry_points.txt,sha256=AP6rZg7y5SK9e9a9iVq0Fi9Q2KPjPZSwtZ6R98rLw-8,56
|
84
|
+
outerbounds-0.3.183rc0.dist-info/RECORD,,
|
File without changes
|
File without changes
|