outerbounds 0.3.182rc2__py3-none-any.whl → 0.3.183rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -137,6 +137,19 @@ class _capsuleDeployerStateMachine:
137
137
  from typing import TypedDict
138
138
 
139
139
 
140
+ class AccessInfo(TypedDict):
141
+ outOfClusterURL: str
142
+ inClusterURL: str
143
+
144
+
145
+ class CapsuleStatus(TypedDict):
146
+ availableReplicas: int
147
+ readyToServeTraffic: bool
148
+ accessInfo: AccessInfo
149
+ updateInProgress: bool
150
+ currentlyServedVersion: str
151
+
152
+
140
153
  class WorkerStatus(TypedDict):
141
154
  workerId: str
142
155
  phase: str
@@ -196,6 +209,86 @@ class DEPLOYMENT_READY_CONDITIONS:
196
209
  # `ASYNC` implies that the deployment will be assumed ready after the URL is minted and the worker statuses are not checked.
197
210
  ASYNC = "async"
198
211
 
212
+ @classmethod
213
+ def check_failure_condition(
214
+ cls,
215
+ capsule_status: CapsuleStatus,
216
+ worker_semantic_status: "CapsuleWorkerSemanticStatus",
217
+ ) -> bool:
218
+ """
219
+ Check if the deployment has failed based on the current capsule and worker status.
220
+ """
221
+ return worker_semantic_status["status"]["at_least_one_crashlooping"]
222
+
223
+ @classmethod
224
+ def check_readiness_condition(
225
+ cls,
226
+ capsule_status: CapsuleStatus,
227
+ worker_semantic_status: "CapsuleWorkerSemanticStatus",
228
+ readiness_condition: str,
229
+ ) -> Tuple[bool, bool]:
230
+ """
231
+ Check if the deployment readiness condition is satisfied based on current capsule and worker status.
232
+
233
+ This method evaluates whether a deployment has reached its desired ready state according to
234
+ the specified readiness condition. Different conditions have different criteria for what
235
+ constitutes a "ready" deployment.
236
+
237
+ Parameters
238
+ ----------
239
+ capsule_status : CapsuleStatus
240
+ The current status of the capsule deployment, including update progress information.
241
+ worker_semantic_status : CapsuleWorkerSemanticStatus
242
+ Semantic status information about the workers, including counts and states.
243
+ readiness_condition : str
244
+ The readiness condition to evaluate. Must be one of the class constants:
245
+ - ATLEAST_ONE_RUNNING: At least one worker is running and update is not in progress
246
+ - ALL_RUNNING: All required workers are running and update is not in progress
247
+ - FULLY_FINISHED: All workers running with no pending/crashlooping workers and update is not in progress
248
+ - ASYNC: Deployment is ready when the backend registers the current serving version in the capsule's status.
249
+
250
+ Returns
251
+ -------
252
+ Tuple[bool, bool]
253
+ A tuple containing:
254
+ - First element: Boolean indicating if the readiness condition is satisfied
255
+ - Second element: Boolean indicating if additional worker readiness checks
256
+ should be performed (False for ASYNC mode, True for all others)
257
+
258
+ Raises
259
+ ------
260
+ ValueError
261
+ If an invalid readiness condition is provided.
262
+ """
263
+ _worker_readiness_check = True
264
+ _readiness_condition_satisfied = False
265
+ if readiness_condition == cls.ATLEAST_ONE_RUNNING:
266
+ _readiness_condition_satisfied = (
267
+ worker_semantic_status["status"]["at_least_one_running"]
268
+ and not capsule_status["updateInProgress"]
269
+ )
270
+ elif readiness_condition == cls.ALL_RUNNING:
271
+ _readiness_condition_satisfied = (
272
+ worker_semantic_status["status"]["all_running"]
273
+ and not capsule_status["updateInProgress"]
274
+ )
275
+ elif readiness_condition == cls.FULLY_FINISHED:
276
+ _readiness_condition_satisfied = (
277
+ worker_semantic_status["status"]["fully_finished"]
278
+ and not capsule_status["updateInProgress"]
279
+ )
280
+ elif readiness_condition == cls.ASYNC:
281
+ # The async readiness condition is satisfied when the currently served version is the same as the final version.
282
+ _readiness_condition_satisfied = (
283
+ capsule_status["currentlyServedVersion"]
284
+ == worker_semantic_status["final_version"]
285
+ )
286
+ _worker_readiness_check = False
287
+ else:
288
+ raise ValueError(f"Invalid readiness condition: {readiness_condition}")
289
+
290
+ return _readiness_condition_satisfied, _worker_readiness_check
291
+
199
292
  @classmethod
200
293
  def docstring(cls):
201
294
  return cls.__doc__
@@ -206,6 +299,7 @@ class DEPLOYMENT_READY_CONDITIONS:
206
299
  cls.ATLEAST_ONE_RUNNING,
207
300
  cls.ALL_RUNNING,
208
301
  cls.FULLY_FINISHED,
302
+ cls.ASYNC,
209
303
  ]
210
304
 
211
305
 
@@ -21,8 +21,6 @@ from typing import Dict, List, Any, Optional, Union
21
21
  # way to figure the right import of click dynamically. a neat way to handle that would be
22
22
  # to have a function that can import the correct click based on the context in which stuff is being loaded.
23
23
  from .click_importer import click
24
- from outerbounds._vendor import yaml
25
- from outerbounds.utils import metaflowconfig
26
24
  from .app_config import (
27
25
  AppConfig,
28
26
  AppConfigError,
@@ -223,6 +221,7 @@ class ColorTheme:
223
221
  LOADING_COLOR = "cyan"
224
222
  BAD_COLOR = "red"
225
223
  INFO_COLOR = "green"
224
+ DEBUG_COLOR = "yellow"
226
225
 
227
226
  TL_HEADER_COLOR = "magenta"
228
227
  ROW_COLOR = "bright_white"
@@ -298,7 +297,7 @@ def _pre_create_debug(
298
297
  ):
299
298
  if CAPSULE_DEBUG:
300
299
  os.makedirs(state_dir, exist_ok=True)
301
- debug_path = os.path.join(state_dir, f"debug_{time.time()}.yaml")
300
+ debug_path = os.path.join(state_dir, f"debug_{time.time()}.json")
302
301
  with open(
303
302
  debug_path,
304
303
  "w",
@@ -321,14 +320,10 @@ def _pre_create_debug(
321
320
  def _post_create_debug(capsule: CapsuleDeployer, state_dir: str):
322
321
  if CAPSULE_DEBUG:
323
322
  debug_path = os.path.join(
324
- state_dir, f"debug_deploy_response_{time.time()}.yaml"
323
+ state_dir, f"debug_deploy_response_{time.time()}.json"
325
324
  )
326
325
  with open(debug_path, "w") as f:
327
- f.write(
328
- yaml.dump(
329
- capsule._capsule_deploy_response, default_flow_style=False, indent=2
330
- )
331
- )
326
+ f.write(json.dumps(capsule._capsule_deploy_response, indent=2, default=str))
332
327
 
333
328
 
334
329
  def _bake_image(app_config: AppConfig, cache_dir: str, logger):
@@ -718,7 +713,7 @@ def deploy(
718
713
 
719
714
  if not ctx.obj.perimeter:
720
715
  raise AppConfigError("OB_CURRENT_PERIMETER is not set")
721
-
716
+ _current_instance_debug_dir = None
722
717
  logger = partial(_logger, timestamp=True)
723
718
  try:
724
719
  _cli_parsed_config = build_config_from_options(options)
@@ -809,9 +804,9 @@ def deploy(
809
804
  ctx.obj.app_state_dir, app_config.get("name", "default")
810
805
  )
811
806
 
812
- def _non_spinner_logger(*msg):
807
+ def _non_spinner_logger(*msg, **kwargs):
813
808
  for m in msg:
814
- logger(m)
809
+ logger(m, **kwargs)
815
810
 
816
811
  deploy_validations(
817
812
  app_config,
@@ -847,15 +842,35 @@ def deploy(
847
842
 
848
843
  app_config.set_state("perimeter", ctx.obj.perimeter)
849
844
 
845
+ capsule_spinner = None
846
+ capsule_logger = _non_spinner_logger
850
847
  # 2. Convert to the IR that the backend accepts
851
848
  capsule = CapsuleDeployer(
852
849
  app_config,
853
850
  ctx.obj.api_url,
854
- debug_dir=cache_dir,
851
+ debug_dir=_current_instance_debug_dir,
855
852
  success_terminal_state_condition=readiness_condition,
856
853
  create_timeout=max_wait_time,
857
854
  readiness_wait_time=readiness_wait_time,
855
+ logger_fn=capsule_logger,
858
856
  )
857
+ _current_instance_debug_dir = os.path.join(
858
+ cache_dir, f"debug_deployment_instance_{time.time()}"
859
+ )
860
+ if CAPSULE_DEBUG:
861
+ os.makedirs(_current_instance_debug_dir, exist_ok=True)
862
+ if not no_loader:
863
+ capsule_spinner = MultiStepSpinner(
864
+ text=lambda: _logger_styled(
865
+ "💊 Waiting for %s %s to be ready to serve traffic"
866
+ % (capsule.capsule_type.lower(), capsule.identifier),
867
+ timestamp=True,
868
+ ),
869
+ color=ColorTheme.LOADING_COLOR,
870
+ )
871
+ capsule_logger = partial(_spinner_logger, capsule_spinner)
872
+ capsule_spinner.start()
873
+
859
874
  currently_present_capsules = list_and_filter_capsules(
860
875
  capsule.capsule_api,
861
876
  None,
@@ -887,13 +902,13 @@ def deploy(
887
902
  "If you wish to force upgrade, you can do so by providing the `--force-upgrade` flag."
888
903
  )
889
904
  raise AppConfigError(message)
890
- logger(
905
+ capsule_logger(
891
906
  f"🚀 {'' if not force_upgrade else 'Force'} Upgrading {capsule.capsule_type.lower()} `{capsule.name}`....",
892
907
  color=ColorTheme.INFO_COLOR,
893
908
  system_msg=True,
894
909
  )
895
910
  else:
896
- logger(
911
+ capsule_logger(
897
912
  f"🚀 Deploying {capsule.capsule_type.lower()} to the platform....",
898
913
  color=ColorTheme.INFO_COLOR,
899
914
  system_msg=True,
@@ -902,23 +917,9 @@ def deploy(
902
917
  capsule.create()
903
918
  _post_create_debug(capsule, cache_dir)
904
919
 
905
- capsule_spinner = None
906
- capsule_logger = _non_spinner_logger
907
- if not no_loader:
908
- capsule_spinner = MultiStepSpinner(
909
- text=lambda: _logger_styled(
910
- "💊 Waiting for %s %s to be ready to serve traffic"
911
- % (capsule.capsule_type.lower(), capsule.identifier),
912
- timestamp=True,
913
- ),
914
- color=ColorTheme.LOADING_COLOR,
915
- )
916
- capsule_logger = partial(_spinner_logger, capsule_spinner)
917
- capsule_spinner.start()
918
-
919
920
  # We only get the `capsule_response` if the deployment has reached
920
921
  # a successful terminal state.
921
- final_status = capsule.wait_for_terminal_state(logger=capsule_logger)
922
+ final_status = capsule.wait_for_terminal_state()
922
923
  if capsule_spinner:
923
924
  capsule_spinner.stop()
924
925
 
@@ -928,6 +929,17 @@ def deploy(
928
929
  system_msg=True,
929
930
  )
930
931
 
932
+ if CAPSULE_DEBUG:
933
+ logger(
934
+ f"[debug] 💊 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) deployment status [on completion]: {final_status}",
935
+ color=ColorTheme.DEBUG_COLOR,
936
+ )
937
+ logger(
938
+ f"[debug] 💊 {capsule.capsule_type} {app_config.config['name']} ({capsule.identifier}) debug info saved to `{_current_instance_debug_dir}`",
939
+ color=ColorTheme.DEBUG_COLOR,
940
+ )
941
+ final_status["debug_dir"] = _current_instance_debug_dir
942
+
931
943
  if status_file:
932
944
  # Create the file if it doesn't exist
933
945
  with open(status_file, "w") as f:
@@ -939,11 +951,19 @@ def deploy(
939
951
  )
940
952
 
941
953
  except Exception as e:
954
+ message = getattr(e, "message", str(e))
942
955
  logger(
943
- f"Deployment failed: [{e.__class__.__name__}]: {e}",
956
+ f"Deployment failed: [{e.__class__.__name__}]: {message}",
944
957
  bad=True,
945
958
  system_msg=True,
946
959
  )
960
+ if CAPSULE_DEBUG:
961
+ if _current_instance_debug_dir is not None:
962
+ logger(
963
+ f"[debug] 💊 debug info saved to `{_current_instance_debug_dir}`",
964
+ color=ColorTheme.DEBUG_COLOR,
965
+ )
966
+ raise e
947
967
  exit(1)
948
968
 
949
969
 
@@ -1,5 +1,7 @@
1
1
  import json
2
2
  import os
3
+
4
+ # TODO: remove the vendored dependency where possible.
3
5
  from outerbounds._vendor import yaml
4
6
  from typing import Dict, Any
5
7
  from .cli_to_config import build_config_from_options
@@ -58,6 +60,7 @@ class AppConfig:
58
60
  @staticmethod
59
61
  def _load_schema():
60
62
  """Load the configuration schema from the YAML file."""
63
+ # TODO: Make it easier.
61
64
  schema_path = os.path.join(os.path.dirname(__file__), "config_schema.yaml")
62
65
  with open(schema_path, "r") as f:
63
66
  return yaml.safe_load(f)
@@ -3,7 +3,9 @@ import json
3
3
  import os
4
4
  import pathlib
5
5
  import requests
6
+ import sys
6
7
  import time
8
+ from functools import partial
7
9
  import shlex
8
10
  from typing import Optional, List, Dict, Any, Tuple, Union
9
11
  from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
@@ -14,6 +16,7 @@ from ._state_machine import (
14
16
  _capsule_worker_status_diff,
15
17
  CapsuleWorkerSemanticStatus,
16
18
  WorkerStatus,
19
+ CapsuleStatus,
17
20
  DEPLOYMENT_READY_CONDITIONS,
18
21
  )
19
22
 
@@ -60,7 +63,7 @@ class CapsuleStateMachine:
60
63
  def get_status_trail(self):
61
64
  return self._status_trail
62
65
 
63
- def add_status(self, status: dict):
66
+ def add_status(self, status: CapsuleStatus):
64
67
  assert type(status) == dict, "TODO: Make this check somewhere else"
65
68
  self._status_trail.append({"timestamp": time.time(), "status": status})
66
69
 
@@ -107,13 +110,10 @@ class CapsuleStateMachine:
107
110
  def report_current_status(self, logger):
108
111
  pass
109
112
 
110
- def check_for_debug(self, state_dir: str):
111
- if CAPSULE_DEBUG:
112
- debug_path = os.path.join(
113
- state_dir, f"debug_capsule_{self._capsule_id}.json"
114
- )
115
- with open(debug_path, "w") as f:
116
- json.dump(self._status_trail, f, indent=4)
113
+ def save_debug_info(self, state_dir: str):
114
+ debug_path = os.path.join(state_dir, f"debug_capsule_{self._capsule_id}.json")
115
+ with open(debug_path, "w") as f:
116
+ json.dump(self._status_trail, f, indent=4)
117
117
 
118
118
 
119
119
  class CapsuleWorkersStateMachine:
@@ -157,19 +157,18 @@ class CapsuleWorkersStateMachine:
157
157
  {"timestamp": time.time(), "status": worker_list_response}
158
158
  )
159
159
 
160
- def check_for_debug(self, state_dir: str):
161
- if CAPSULE_DEBUG:
162
- debug_path = os.path.join(
163
- state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
164
- )
165
- with open(debug_path, "w") as f:
166
- json.dump(self._status_trail, f, indent=4)
160
+ def save_debug_info(self, state_dir: str):
161
+ debug_path = os.path.join(
162
+ state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
163
+ )
164
+ with open(debug_path, "w") as f:
165
+ json.dump(self._status_trail, f, indent=4)
167
166
 
168
- status_path = os.path.join(
169
- state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
170
- )
171
- with open(status_path, "w") as f:
172
- json.dump(self.current_version_deployment_status(), f, indent=4)
167
+ status_path = os.path.join(
168
+ state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
169
+ )
170
+ with open(status_path, "w") as f:
171
+ json.dump(self.current_version_deployment_status(), f, indent=4)
173
172
 
174
173
  def report_current_status(self, logger):
175
174
  if len(self._status_trail) == 0:
@@ -196,29 +195,7 @@ class CapsuleWorkersStateMachine:
196
195
  )
197
196
 
198
197
  @property
199
- def is_terminal_state(self):
200
- return any(
201
- s is not None for s in [self.is_success_state, self.is_failure_state]
202
- )
203
-
204
- @property
205
- def is_success_state(self):
206
- status = self.current_version_deployment_status()
207
- if self._deployment_mode == DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING:
208
- return status["status"]["at_least_one_running"]
209
- elif self._deployment_mode == DEPLOYMENT_READY_CONDITIONS.ALL_RUNNING:
210
- return status["status"]["all_running"]
211
- elif self._deployment_mode == DEPLOYMENT_READY_CONDITIONS.FULLY_FINISHED:
212
- return (
213
- status["status"]["current_info"]["running"] == self._minimum_replicas
214
- and status["status"]["current_info"]["pending"] == 0
215
- and status["status"]["current_info"]["crashlooping"] == 0
216
- )
217
- else:
218
- raise ValueError(f"Unknown deployment mode: {self._deployment_mode}")
219
-
220
- @property
221
- def is_failure_state(self):
198
+ def is_crashlooping(self) -> bool:
222
199
  status = self.current_version_deployment_status()
223
200
  return status["status"]["at_least_one_crashlooping"]
224
201
 
@@ -352,19 +329,24 @@ class CapsuleApiException(Exception):
352
329
 
353
330
 
354
331
  class CapsuleDeploymentException(Exception):
355
- def __init__(self, capsule_id: str, message: str):
332
+ def __init__(
333
+ self,
334
+ capsule_id: str,
335
+ message: str,
336
+ ):
356
337
  self.capsule_id = capsule_id
357
338
  self.message = message
358
339
 
359
340
  def __str__(self):
360
- return f"CapsuleDeploymentException: {self.capsule_id} \n\n {self.message}"
341
+ return f"CapsuleDeploymentException: [{self.capsule_id}] :: {self.message}"
361
342
 
362
343
 
363
344
  class CapsuleApi:
364
- def __init__(self, base_url: str, perimeter: str):
345
+ def __init__(self, base_url: str, perimeter: str, logger_fn=None):
365
346
  self._base_url = self._create_base_url(base_url, perimeter)
366
347
  from metaflow.metaflow_config import SERVICE_HEADERS
367
348
 
349
+ self._logger_fn = logger_fn
368
350
  self._request_headers = {
369
351
  **{"Content-Type": "application/json", "Connection": "keep-alive"},
370
352
  **(SERVICE_HEADERS or {}),
@@ -386,6 +368,7 @@ class CapsuleApi:
386
368
  method_func,
387
369
  *args,
388
370
  headers=self._request_headers,
371
+ logger_fn=self._logger_fn,
389
372
  **kwargs,
390
373
  )
391
374
  except MaximumRetriesExceeded as e:
@@ -600,10 +583,16 @@ class CapsuleDeployer:
600
583
  debug_dir: Optional[str] = None,
601
584
  success_terminal_state_condition: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
602
585
  readiness_wait_time: int = 20,
586
+ logger_fn=None,
603
587
  ):
604
588
  self._app_config = app_config
605
- self._capsule_api = CapsuleApi(base_url, app_config.get_state("perimeter"))
589
+ self._capsule_api = CapsuleApi(
590
+ base_url,
591
+ app_config.get_state("perimeter"),
592
+ logger_fn=logger_fn or partial(print, file=sys.stderr),
593
+ )
606
594
  self._create_timeout = create_timeout
595
+ self._logger_fn = logger_fn
607
596
  self._debug_dir = debug_dir
608
597
  self._capsule_deploy_response = None
609
598
  self._success_terminal_state_condition = success_terminal_state_condition
@@ -669,14 +658,19 @@ class CapsuleDeployer:
669
658
  )
670
659
 
671
660
  def _monitor_worker_readiness(
672
- self, workers_sm: "CapsuleWorkersStateMachine", logger=print
661
+ self,
662
+ workers_sm: "CapsuleWorkersStateMachine",
673
663
  ):
674
- """ """
664
+ """returns True if the worker is crashlooping, False otherwise"""
665
+ logger = self._logger_fn or partial(print, file=sys.stderr)
675
666
  for i in range(self._readiness_wait_time):
676
667
  time.sleep(1)
677
668
  workers_response = self.get_workers()
678
669
  workers_sm.add_status(workers_response)
679
670
  workers_sm.report_current_status(logger)
671
+ if workers_sm.is_crashlooping:
672
+ return True
673
+ return False
680
674
 
681
675
  def _extract_logs_from_crashlooping_worker(
682
676
  self, workers_sm: "CapsuleWorkersStateMachine"
@@ -697,13 +691,14 @@ class CapsuleDeployer:
697
691
 
698
692
  def wait_for_terminal_state(
699
693
  self,
700
- logger=print,
701
694
  ):
702
695
  """ """
696
+ logger = self._logger_fn or partial(print, file=sys.stderr)
703
697
  state_machine = CapsuleStateMachine(
704
698
  self.identifier, self.current_deployment_instance_version
705
699
  )
706
- min_replicas = self._app_config.get_state("replicas", {}).get("min", 1)
700
+ # min_replicas will always be present
701
+ min_replicas = self._app_config.get_state("replicas", {}).get("min")
707
702
  workers_state_machine = CapsuleWorkersStateMachine(
708
703
  self.identifier,
709
704
  self.current_deployment_instance_version,
@@ -730,10 +725,22 @@ class CapsuleDeployer:
730
725
  # of the workerstate machine. If we detect a terminal state in the workers,
731
726
  # then even if the capsule upgrade is still in progress we will end up crashing
732
727
  # the deployment.
733
- if (
734
- not state_machine.update_in_progress
735
- and workers_state_machine.is_terminal_state
736
- ) or (workers_state_machine.is_failure_state):
728
+ (
729
+ capsule_ready,
730
+ further_check_worker_readiness,
731
+ ) = DEPLOYMENT_READY_CONDITIONS.check_readiness_condition(
732
+ state_machine.current_status,
733
+ workers_state_machine.current_version_deployment_status(),
734
+ self._success_terminal_state_condition,
735
+ )
736
+
737
+ failure_condition_satisfied = (
738
+ DEPLOYMENT_READY_CONDITIONS.check_failure_condition(
739
+ state_machine.current_status,
740
+ workers_state_machine.current_version_deployment_status(),
741
+ )
742
+ )
743
+ if capsule_ready or failure_condition_satisfied:
737
744
  logger(
738
745
  "💊 %s deployment status: %s | worker states: [success :%s | failure :%s ] "
739
746
  % (
@@ -741,19 +748,31 @@ class CapsuleDeployer:
741
748
  "in progress"
742
749
  if state_machine.update_in_progress
743
750
  else "completed",
744
- workers_state_machine.is_success_state,
745
- workers_state_machine.is_failure_state,
751
+ capsule_ready,
752
+ failure_condition_satisfied,
746
753
  )
747
754
  )
748
- if workers_state_machine.is_success_state:
755
+ _further_readiness_check_failed = False
756
+ if further_check_worker_readiness:
749
757
  # HACK : monitor the workers for N seconds to make sure they are healthy
750
758
  # this is a hack. Ideally we should implement a healthcheck as a first-class citizen
751
759
  # but it will take some time to do that so in the meanwhile a timeout set on the cli
752
760
  # side will be really helpful.
753
- self._monitor_worker_readiness(workers_state_machine, logger)
761
+ logger(
762
+ "💊 running last minute readiness check for %s..."
763
+ % self.identifier
764
+ )
765
+ _further_readiness_check_failed = self._monitor_worker_readiness(
766
+ workers_state_machine
767
+ )
768
+
769
+ if CAPSULE_DEBUG:
770
+ logger(
771
+ f"[debug] 💊 {self.capsule_type} {self.identifier}: further_check_worker_readiness {_further_readiness_check_failed} | failure_condition_satisfied {failure_condition_satisfied}"
772
+ )
754
773
 
755
774
  # We should still check for failure state and crash if we detect something in the readiness check
756
- if workers_state_machine.is_failure_state:
775
+ if failure_condition_satisfied or _further_readiness_check_failed:
757
776
  # hit the logs endpoint for the worker and get the logs
758
777
  # Print those logs out on the terminal
759
778
  # raise an exception that should be caught gracefully by the cli
@@ -788,9 +807,13 @@ class CapsuleDeployer:
788
807
 
789
808
  break
790
809
 
791
- if self._debug_dir:
792
- state_machine.check_for_debug(self._debug_dir)
793
- workers_state_machine.check_for_debug(self._debug_dir)
810
+ if CAPSULE_DEBUG and self._debug_dir:
811
+ state_machine.save_debug_info(self._debug_dir)
812
+ workers_state_machine.save_debug_info(self._debug_dir)
813
+ if i % 3 == 0: # Every 3 seconds report the status
814
+ logger(
815
+ f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
816
+ )
794
817
 
795
818
  if not self.status.ready_to_serve_traffic:
796
819
  raise CapsuleDeploymentException(
@@ -798,6 +821,13 @@ class CapsuleDeployer:
798
821
  f"Capsule {self.identifier} failed to be ready to serve traffic",
799
822
  )
800
823
 
824
+ if CAPSULE_DEBUG and self._debug_dir:
825
+ state_machine.save_debug_info(self._debug_dir)
826
+ workers_state_machine.save_debug_info(self._debug_dir)
827
+ logger(
828
+ f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status [on return]: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
829
+ )
830
+
801
831
  return dict(
802
832
  id=self.identifier,
803
833
  auth_type=self.capsule_type,
@@ -5,6 +5,11 @@ description: |
5
5
  How to read this schema:
6
6
  1. If a property has `allow_union: true` then it will allow overrides from the CLI.
7
7
  2. If a property has `experimental` set to true then a lot of its validations may be skipped and parsing is handled somewhere else.
8
+
9
+ The YAML-based schema file is for humans to edit and read. The JSON-based schema file is autogenerated by pre-commit
10
+ hooks so that we can use it within the outerbounds package. The reason for having two distinct file types is that YAML provides the ability to
11
+ add comments and makes the schema easier to read, while JSON lets us reduce the dependency on YAML when working with apps within both
12
+ Metaflow and the OB package.
8
13
  version: 1.0.0
9
14
  type: object
10
15
  required:
@@ -0,0 +1,336 @@
1
+ {
2
+ "title": "Outerbounds App Configuration Schema",
3
+ "description": "Schema for defining Outerbounds Apps configuration. This schema is what we will end up using on the CLI/programmatic interface.\nHow to read this schema:\n 1. If a property has `allow_union: true` then it will allow overrides from the CLI.\n 2. If a property has `experimental` set to true then a lot of its validations may be skipped and parsing is handled somewhere else.\n\nThe YAML-based schema file is for humans to edit and read. The JSON-based schema file is autogenerated by pre-commit\nhooks so that we can use it within the outerbounds package. The reason for having two distinct file types is that YAML provides the ability to\nadd comments and makes the schema easier to read, while JSON lets us reduce the dependency on YAML when working with apps within both\nMetaflow and the OB package.\n",
4
+ "version": "1.0.0",
5
+ "type": "object",
6
+ "required": [
7
+ "name",
8
+ "port"
9
+ ],
10
+ "properties": {
11
+ "name": {
12
+ "allow_union": true,
13
+ "type": "string",
14
+ "description": "The name of the app to deploy.",
15
+ "maxLength": 20,
16
+ "example": "myapp"
17
+ },
18
+ "port": {
19
+ "allow_union": false,
20
+ "type": "integer",
21
+ "description": "Port where the app is hosted. When deployed this will be the port on which we will deploy the app.",
22
+ "minimum": 1,
23
+ "maximum": 65535,
24
+ "example": 8000
25
+ },
26
+ "tags": {
27
+ "allow_union": true,
28
+ "type": "array",
29
+ "description": "The tags of the app to deploy.",
30
+ "items": {
31
+ "type": "object"
32
+ },
33
+ "example": [
34
+ {
35
+ "foo": "bar"
36
+ },
37
+ {
38
+ "x": "y"
39
+ }
40
+ ]
41
+ },
42
+ "description": {
43
+ "allow_union": true,
44
+ "type": "string",
45
+ "description": "The description of the app to deploy.",
46
+ "example": "This is a description of my app."
47
+ },
48
+ "force_upgrade": {
49
+ "allow_union": true,
50
+ "type": "boolean",
51
+ "description": "Whether to force upgrade the app even if it is currently being upgraded.",
52
+ "example": true
53
+ },
54
+ "app_type": {
55
+ "allow_union": true,
56
+ "type": "string",
57
+ "description": "The user-defined type of app to deploy. It's only used for bookkeeping purposes.",
58
+ "example": "MyCustomAgent"
59
+ },
60
+ "image": {
61
+ "allow_union": true,
62
+ "type": "string",
63
+ "description": "The Docker image to deploy with the App."
64
+ },
65
+ "secrets": {
66
+ "allow_union": true,
67
+ "type": "array",
68
+ "description": "Outerbounds integrations to attach to the app. You can use the value you set in the `@secrets` decorator in your code.",
69
+ "items": {
70
+ "type": "string"
71
+ },
72
+ "example": [
73
+ "hf-token"
74
+ ]
75
+ },
76
+ "environment": {
77
+ "allow_union": true,
78
+ "type": "object",
79
+ "description": "Environment variables to deploy with the App.",
80
+ "additionalProperties": {
81
+ "oneOf": [
82
+ {
83
+ "type": "string"
84
+ },
85
+ {
86
+ "type": "number"
87
+ },
88
+ {
89
+ "type": "boolean"
90
+ },
91
+ {
92
+ "type": "object"
93
+ },
94
+ {
95
+ "type": "array"
96
+ }
97
+ ]
98
+ },
99
+ "example": {
100
+ "DEBUG": true,
101
+ "DATABASE_CONFIG": {
102
+ "host": "localhost",
103
+ "port": 5432
104
+ },
105
+ "ALLOWED_ORIGINS": [
106
+ "http://localhost:3000",
107
+ "https://myapp.com"
108
+ ]
109
+ }
110
+ },
111
+ "dependencies": {
112
+ "allow_union": false,
113
+ "type": "object",
114
+ "description": "The dependencies to attach to the app. Only one of the properties can be specified.\n",
115
+ "properties": {
116
+ "from_requirements_file": {
117
+ "type": "string",
118
+ "description": "The path to the requirements.txt file to attach to the app.",
119
+ "example": "requirements.txt"
120
+ },
121
+ "from_pyproject_toml": {
122
+ "type": "string",
123
+ "description": "The path to the pyproject.toml file to attach to the app.",
124
+ "example": "pyproject.toml"
125
+ },
126
+ "python": {
127
+ "type": "string",
128
+ "description": "The Python version to use for the app.\n",
129
+ "example": "3.10"
130
+ },
131
+ "pypi": {
132
+ "type": "object",
133
+ "description": "A dictionary of pypi dependencies to attach to the app.\nThe key is the package name and the value is the version.\n",
134
+ "example": {
135
+ "numpy": "1.23.0",
136
+ "pandas": ""
137
+ }
138
+ },
139
+ "conda": {
140
+ "type": "object",
141
+ "description": "A dictionary of conda dependencies to attach to the app.\nThe key is the package name and the value is the version.\n",
142
+ "example": {
143
+ "numpy": "1.23.0",
144
+ "pandas": ""
145
+ }
146
+ }
147
+ }
148
+ },
149
+ "package": {
150
+ "allow_union": false,
151
+ "type": "object",
152
+ "description": "Configurations associated with packaging the app.\n",
153
+ "properties": {
154
+ "src_path": {
155
+ "type": "string",
156
+ "description": "The path to the source code to deploy with the App.",
157
+ "example": "./"
158
+ },
159
+ "suffixes": {
160
+ "type": "array",
161
+ "description": "A list of suffixes to add to the source code to deploy with the App.\n",
162
+ "items": {
163
+ "type": "string"
164
+ },
165
+ "example": [
166
+ ".py",
167
+ ".ipynb"
168
+ ]
169
+ }
170
+ }
171
+ },
172
+ "commands": {
173
+ "allow_union": false,
174
+ "type": "array",
175
+ "description": "A list of commands to run the app with. Cannot be configured from the CLI. Only used in `run` command.",
176
+ "items": {
177
+ "type": "string"
178
+ },
179
+ "example": [
180
+ "python app.py",
181
+ "python app.py --foo bar"
182
+ ]
183
+ },
184
+ "resources": {
185
+ "allow_union": true,
186
+ "type": "object",
187
+ "properties": {
188
+ "cpu": {
189
+ "type": "string",
190
+ "description": "CPU resource request and limit.",
191
+ "example": "500m",
192
+ "default": "1"
193
+ },
194
+ "memory": {
195
+ "type": "string",
196
+ "description": "Memory resource request and limit.",
197
+ "example": "512Mi",
198
+ "default": "4Gi"
199
+ },
200
+ "gpu": {
201
+ "type": "string",
202
+ "description": "GPU resource request and limit.",
203
+ "example": "1"
204
+ },
205
+ "storage": {
206
+ "type": "string",
207
+ "description": "Storage resource request and limit.",
208
+ "example": "1Gi",
209
+ "default": "10Gi"
210
+ }
211
+ }
212
+ },
213
+ "replicas": {
214
+ "allow_union": true,
215
+ "type": "object",
216
+ "description": "The number of replicas to deploy the app with.\n",
217
+ "properties": {
218
+ "min": {
219
+ "type": "integer",
220
+ "description": "The minimum number of replicas to deploy the app with.",
221
+ "example": 1
222
+ },
223
+ "max": {
224
+ "type": "integer",
225
+ "description": "The maximum number of replicas to deploy the app with.",
226
+ "example": 10
227
+ }
228
+ }
229
+ },
230
+ "health_check": {
231
+ "type": "object",
232
+ "allow_union": false,
233
+ "properties": {
234
+ "enabled": {
235
+ "type": "boolean",
236
+ "description": "Whether to enable health checks.",
237
+ "example": true,
238
+ "default": false
239
+ },
240
+ "path": {
241
+ "type": "string",
242
+ "description": "The path for health checks.",
243
+ "example": "/health"
244
+ },
245
+ "initial_delay_seconds": {
246
+ "type": "integer",
247
+ "description": "Number of seconds to wait before performing the first health check.",
248
+ "example": 10
249
+ },
250
+ "period_seconds": {
251
+ "type": "integer",
252
+ "description": "How often to perform the health check.",
253
+ "example": 30
254
+ }
255
+ }
256
+ },
257
+ "compute_pools": {
258
+ "allow_union": true,
259
+ "type": "array",
260
+ "description": "A list of compute pools to deploy the app to.\n",
261
+ "items": {
262
+ "type": "string"
263
+ },
264
+ "example": [
265
+ "default",
266
+ "large"
267
+ ]
268
+ },
269
+ "auth": {
270
+ "allow_union": false,
271
+ "type": "object",
272
+ "description": "Auth related configurations.\n",
273
+ "properties": {
274
+ "type": {
275
+ "type": "string",
276
+ "description": "The type of authentication to use for the app.\n",
277
+ "enum": [
278
+ "API",
279
+ "Browser"
280
+ ]
281
+ },
282
+ "public": {
283
+ "type": "boolean",
284
+ "description": "Whether the app is public or not.\n",
285
+ "default": true
286
+ }
287
+ }
288
+ },
289
+ "project": {
290
+ "type": "string",
291
+ "description": "The project name to deploy the app to.",
292
+ "experimental": true,
293
+ "allow_union": true
294
+ },
295
+ "branch": {
296
+ "type": "string",
297
+ "description": "The branch name to deploy the app to.",
298
+ "experimental": true,
299
+ "allow_union": true
300
+ },
301
+ "models": {
302
+ "type": "array",
303
+ "description": "model asset ids to include with the deployment. NO CLI Option for this Now.",
304
+ "experimental": true,
305
+ "allow_union": true,
306
+ "items": {
307
+ "type": "object",
308
+ "properties": {
309
+ "asset_id": {
310
+ "type": "string"
311
+ },
312
+ "asset_instance_id": {
313
+ "type": "string"
314
+ }
315
+ }
316
+ }
317
+ },
318
+ "data": {
319
+ "type": "array",
320
+ "description": "data asset ids to include with the deployment.",
321
+ "experimental": true,
322
+ "allow_union": true,
323
+ "items": {
324
+ "type": "object",
325
+ "properties": {
326
+ "asset_id": {
327
+ "type": "string"
328
+ },
329
+ "asset_instance_id": {
330
+ "type": "string"
331
+ }
332
+ }
333
+ }
334
+ }
335
+ }
336
+ }
outerbounds/apps/utils.py CHANGED
@@ -17,6 +17,7 @@ import logging
17
17
  import itertools
18
18
  from typing import Union, Callable, Any, List
19
19
 
20
+ # TODO: remove vendor'd dependency where possible.
20
21
  from outerbounds._vendor.spinner import (
21
22
  Spinners,
22
23
  )
@@ -176,6 +177,7 @@ def safe_requests_wrapper(
176
177
  *args,
177
178
  conn_error_retries=2,
178
179
  retryable_status_codes=[409],
180
+ logger_fn=None,
179
181
  **kwargs,
180
182
  ):
181
183
  """
@@ -206,10 +208,15 @@ def safe_requests_wrapper(
206
208
  if response.status_code not in retryable_status_codes:
207
209
  return response
208
210
  if CAPSULE_DEBUG:
209
- print(
210
- f"[outerbounds-debug] safe_requests_wrapper: {response.url}[{requests_module_fn.__name__}] {response.status_code} {response.text}",
211
- file=sys.stderr,
212
- )
211
+ if logger_fn:
212
+ logger_fn(
213
+ f"[outerbounds-debug] safe_requests_wrapper: {response.url}[{requests_module_fn.__name__}] {response.status_code} {response.text}",
214
+ )
215
+ else:
216
+ print(
217
+ f"[outerbounds-debug] safe_requests_wrapper: {response.url}[{requests_module_fn.__name__}] {response.status_code} {response.text}",
218
+ file=sys.stderr,
219
+ )
213
220
  _num_retries += 1
214
221
  time.sleep((2 ** (_num_retries + 1)) + noise)
215
222
  except requests.exceptions.ConnectionError:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: outerbounds
3
- Version: 0.3.182rc2
3
+ Version: 0.3.183rc1
4
4
  Summary: More Data Science, Less Administration
5
5
  License: Proprietary
6
6
  Keywords: data science,machine learning,MLOps
@@ -28,9 +28,9 @@ Requires-Dist: google-auth (>=2.27.0,<3.0.0) ; extra == "gcp"
28
28
  Requires-Dist: google-cloud-secret-manager (>=2.20.0,<3.0.0) ; extra == "gcp"
29
29
  Requires-Dist: google-cloud-storage (>=2.14.0,<3.0.0) ; extra == "gcp"
30
30
  Requires-Dist: metaflow-checkpoint (==0.2.1)
31
- Requires-Dist: ob-metaflow (==2.15.17.1)
32
- Requires-Dist: ob-metaflow-extensions (==1.1.170rc2)
33
- Requires-Dist: ob-metaflow-stubs (==6.0.3.182rc2)
31
+ Requires-Dist: ob-metaflow (==2.15.18.1)
32
+ Requires-Dist: ob-metaflow-extensions (==1.1.171rc1)
33
+ Requires-Dist: ob-metaflow-stubs (==6.0.3.183rc1)
34
34
  Requires-Dist: opentelemetry-distro (>=0.41b0) ; extra == "otel"
35
35
  Requires-Dist: opentelemetry-exporter-otlp-proto-http (>=1.20.0) ; extra == "otel"
36
36
  Requires-Dist: opentelemetry-instrumentation-requests (>=0.41b0) ; extra == "otel"
@@ -43,23 +43,24 @@ outerbounds/_vendor/yaml/scanner.py,sha256=ZcI8IngR56PaQ0m27WU2vxCqmDCuRjz-hr7pi
43
43
  outerbounds/_vendor/yaml/serializer.py,sha256=8wFZRy9SsQSktF_f9OOroroqsh4qVUe53ry07P9UgCc,4368
44
44
  outerbounds/_vendor/yaml/tokens.py,sha256=JBSu38wihGr4l73JwbfMA7Ks1-X84g8-NskTz7KwPmA,2578
45
45
  outerbounds/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
- outerbounds/apps/_state_machine.py,sha256=eAJFNt_9kwE70XyXoMU107TrzPRnb2xcAyVtXV1yAGc,14646
47
- outerbounds/apps/app_cli.py,sha256=0TCSxkKz9psJMGULMJcqXvrEJNibBDUTVoY37N5pcIk,51003
48
- outerbounds/apps/app_config.py,sha256=UHVK8JLIuW-OcGg5WxDm4QHeImPGtohD4KpJryZntC4,11307
46
+ outerbounds/apps/_state_machine.py,sha256=uv3tGOWxqY0NdQdsjg3uvPpBu0JeniE41AE9_H7El5c,18601
47
+ outerbounds/apps/app_cli.py,sha256=bblWHLC9h6VCaXYnVndrrpmcuahapbsR9NzeU4rX84I,52107
48
+ outerbounds/apps/app_config.py,sha256=ixxeOlZD0AB2TFoN-mzeqEqduGriTSCsdFFm23bVpqw,11382
49
49
  outerbounds/apps/artifacts.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
- outerbounds/apps/capsule.py,sha256=8ZoEjLwcUvbps6H03hkl25cAGj8pyaVNziaAK8AZTxY,31523
50
+ outerbounds/apps/capsule.py,sha256=NC9ajD06y6U-COi-8Qw6k_N1ltbQAio2O_Xs2RTrAVA,32857
51
51
  outerbounds/apps/cli_to_config.py,sha256=Thc5jXRxoU6Pr8kAVVOX-5Es5ha6y6Vh_GBzL__oI7Q,3299
52
52
  outerbounds/apps/click_importer.py,sha256=nnkPOR6TKrtIpc3a5Fna1zVJoQqDZvUXlNA9CdiNKFc,995
53
53
  outerbounds/apps/code_package/__init__.py,sha256=8McF7pgx8ghvjRnazp2Qktlxi9yYwNiwESSQrk-2oW8,68
54
54
  outerbounds/apps/code_package/code_packager.py,sha256=RWvM5BKjgLhu7icsO_n5SSYC57dwyST0dWpoWF88ovU,22881
55
55
  outerbounds/apps/code_package/examples.py,sha256=aF8qKIJxCVv_ugcShQjqUsXKKKMsm1oMkQIl8w3QKuw,4016
56
- outerbounds/apps/config_schema.yaml,sha256=j_mysTAPkIMSocItTg3aduMDfBs2teIhAErvpF0Elus,8826
56
+ outerbounds/apps/config_schema.yaml,sha256=85g2khmkZg5pduilcOXMgn5m2I3DF_2WrRMjo5WsojM,9272
57
+ outerbounds/apps/config_schema_autogen.json,sha256=Mu9y6n-ophJ-48M9kBU4_tL6I9Y5nl06lMyufGMfOF8,9855
57
58
  outerbounds/apps/dependencies.py,sha256=03pZY-JRN-dYN-iyZ73zoEIEKmrOvbY4qld7RlRXYuw,3976
58
59
  outerbounds/apps/deployer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
60
  outerbounds/apps/experimental/__init__.py,sha256=RUZBAyqFnX3pRQxTjNmS1-qpgQcc9xQGQD2yJh4MA_M,3349
60
61
  outerbounds/apps/perimeters.py,sha256=1J1_-5legFPskv3HTRwQMpzTytE3TO8KRT2IvVOrWcQ,1584
61
62
  outerbounds/apps/secrets.py,sha256=aWzcAayQEJghQgFP_qp9w6jyvan_hoL4_ceqZ0ZjLd4,6126
62
- outerbounds/apps/utils.py,sha256=6REvD9PtJcLYzrxX5lZ5Dzzm-Sy2l-I1oSzQN9viYRs,7611
63
+ outerbounds/apps/utils.py,sha256=Yvoj1NFoIBqqaWQ32530saCb66B_juSiDOanH0CJJDE,7950
63
64
  outerbounds/apps/validations.py,sha256=kR2eXckx0XJ4kUOOLkMRepbTh0INtL1Z8aV4-fZpfc8,678
64
65
  outerbounds/cli_main.py,sha256=e9UMnPysmc7gbrimq2I4KfltggyU7pw59Cn9aEguVcU,74
65
66
  outerbounds/command_groups/__init__.py,sha256=QPWtj5wDRTINDxVUL7XPqG3HoxHNvYOg08EnuSZB2Hc,21
@@ -78,7 +79,7 @@ outerbounds/utils/metaflowconfig.py,sha256=l2vJbgPkLISU-XPGZFaC8ZKmYFyJemlD6bwB-
78
79
  outerbounds/utils/schema.py,sha256=lMUr9kNgn9wy-sO_t_Tlxmbt63yLeN4b0xQXbDUDj4A,2331
79
80
  outerbounds/utils/utils.py,sha256=4Z8cszNob_8kDYCLNTrP-wWads_S_MdL3Uj3ju4mEsk,501
80
81
  outerbounds/vendor.py,sha256=gRLRJNXtZBeUpPEog0LOeIsl6GosaFFbCxUvR4bW6IQ,5093
81
- outerbounds-0.3.182rc2.dist-info/METADATA,sha256=QNhNeHMNuCMReA27whbsen3OPIniLUinbQ3ScvLfQLc,1846
82
- outerbounds-0.3.182rc2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
83
- outerbounds-0.3.182rc2.dist-info/entry_points.txt,sha256=AP6rZg7y5SK9e9a9iVq0Fi9Q2KPjPZSwtZ6R98rLw-8,56
84
- outerbounds-0.3.182rc2.dist-info/RECORD,,
82
+ outerbounds-0.3.183rc1.dist-info/METADATA,sha256=Szdy_WqZjvdilRX2b-KlnLVKg6T3bGlmGCOQzlHNyRw,1846
83
+ outerbounds-0.3.183rc1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
84
+ outerbounds-0.3.183rc1.dist-info/entry_points.txt,sha256=AP6rZg7y5SK9e9a9iVq0Fi9Q2KPjPZSwtZ6R98rLw-8,56
85
+ outerbounds-0.3.183rc1.dist-info/RECORD,,