outerbounds 0.3.183rc1__py3-none-any.whl → 0.3.185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,839 +0,0 @@
1
- from datetime import datetime
2
- import json
3
- import os
4
- import pathlib
5
- import requests
6
- import sys
7
- import time
8
- from functools import partial
9
- import shlex
10
- from typing import Optional, List, Dict, Any, Tuple, Union
11
- from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
12
- from .app_config import AppConfig, CAPSULE_DEBUG, AuthType
13
- from . import experimental
14
- from ._state_machine import (
15
- _capsule_worker_semantic_status,
16
- _capsule_worker_status_diff,
17
- CapsuleWorkerSemanticStatus,
18
- WorkerStatus,
19
- CapsuleStatus,
20
- DEPLOYMENT_READY_CONDITIONS,
21
- )
22
-
23
-
24
- class CapsuleStateMachine:
25
- """
26
- - Every capsule create call will return an `identifier` and a `version` of the object.
27
- - Each update call will return a new version.
28
- - The status.currentlyServedVersion will be the version that is currently serving traffic.
29
- - The status.updateInProgress will be True if an upgrade is in progress.
30
-
31
- CapsuleState Transition:
32
- - Every capsule create call will return an `identifier` and a `version` of the object.
33
- - Happy Path:
34
- - First time Create :
35
- - wait for status.updateInProgress to be set to False
36
- - (interleaved) Poll the worker endpoints to check their status
37
- - show on the CLI side how many workers are coming up.
38
- - If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
39
- serve traffic.
40
- - once the status.updateInProgress is set to False, it means that the replicas are ready
41
- - Upgrade:
42
- - wait for status.updateInProgress to be set to False
43
- - (interleaved) Poll the worker endpoints to check their status and signal to the user the number of replicas coming up
44
- - If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
45
- serve traffic.
46
- - Unhappy Path:
47
- - First time Create :
48
- - wait for status.updateInProgress to be set to False,
49
- - (interleaved) Poll the workers to check their status.
50
- - If the worker pertaining to the current deployment instance version is crashlooping, then crash the deployment process with the error messages and logs.
51
- - Upgrade:
52
- - wait for status.updateInProgress to be set to False,
53
- - (interleaved) Poll the workers to check their status.
54
- - If the worker pertaining to the current deployment instance version is crashlooping, then crash the deployment process with the error messages and logs.
55
-
56
- """
57
-
58
- def __init__(self, capsule_id: str, current_deployment_instance_version: str):
59
- self._capsule_id = capsule_id
60
- self._status_trail: List[Dict[str, Any]] = []
61
- self._current_deployment_instance_version = current_deployment_instance_version
62
-
63
- def get_status_trail(self):
64
- return self._status_trail
65
-
66
- def add_status(self, status: CapsuleStatus):
67
- assert type(status) == dict, "TODO: Make this check somewhere else"
68
- self._status_trail.append({"timestamp": time.time(), "status": status})
69
-
70
- @property
71
- def current_status(self):
72
- return self._status_trail[-1].get("status")
73
-
74
- @property
75
- def out_of_cluster_url(self):
76
- access_info = self.current_status.get("accessInfo", {}) or {}
77
- url = access_info.get("outOfClusterURL", None)
78
- if url is not None:
79
- return f"https://{url}"
80
- return None
81
-
82
- @property
83
- def in_cluster_url(self):
84
- access_info = self.current_status.get("accessInfo", {}) or {}
85
- url = access_info.get("inClusterURL", None)
86
- if url is not None:
87
- return f"https://{url}"
88
- return None
89
-
90
- @property
91
- def update_in_progress(self):
92
- return self.current_status.get("updateInProgress", False)
93
-
94
- @property
95
- def currently_served_version(self):
96
- return self.current_status.get("currentlyServedVersion", None)
97
-
98
- @property
99
- def ready_to_serve_traffic(self):
100
- if self.current_status.get("readyToServeTraffic", False):
101
- return any(
102
- i is not None for i in [self.out_of_cluster_url, self.in_cluster_url]
103
- )
104
- return False
105
-
106
- @property
107
- def available_replicas(self):
108
- return self.current_status.get("availableReplicas", 0)
109
-
110
- def report_current_status(self, logger):
111
- pass
112
-
113
- def save_debug_info(self, state_dir: str):
114
- debug_path = os.path.join(state_dir, f"debug_capsule_{self._capsule_id}.json")
115
- with open(debug_path, "w") as f:
116
- json.dump(self._status_trail, f, indent=4)
117
-
118
-
119
- class CapsuleWorkersStateMachine:
120
- def __init__(
121
- self,
122
- capsule_id: str,
123
- end_state_capsule_version: str,
124
- deployment_mode: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
125
- minimum_replicas: int = 1,
126
- ):
127
- self._capsule_id = capsule_id
128
- self._end_state_capsule_version = end_state_capsule_version
129
- self._deployment_mode = deployment_mode
130
- self._minimum_replicas = minimum_replicas
131
- self._status_trail: List[Dict[str, Union[float, List[WorkerStatus]]]] = []
132
-
133
- def get_status_trail(self):
134
- return self._status_trail
135
-
136
- def add_status(self, worker_list_response: List[WorkerStatus]):
137
- """
138
- worker_list_response: List[Dict[str, Any]]
139
- [
140
- {
141
- "workerId": "c-4pqikm-659dd9ccdc-5hcwz",
142
- "phase": "Running",
143
- "activity": 0,
144
- "activityDataAvailable": false,
145
- "version": "0xhgaewiqb"
146
- },
147
- {
148
- "workerId": "c-4pqikm-b8559688b-xk2jh",
149
- "phase": "Pending",
150
- "activity": 0,
151
- "activityDataAvailable": false,
152
- "version": "421h48qh95"
153
- }
154
- ]
155
- """
156
- self._status_trail.append(
157
- {"timestamp": time.time(), "status": worker_list_response}
158
- )
159
-
160
- def save_debug_info(self, state_dir: str):
161
- debug_path = os.path.join(
162
- state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
163
- )
164
- with open(debug_path, "w") as f:
165
- json.dump(self._status_trail, f, indent=4)
166
-
167
- status_path = os.path.join(
168
- state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
169
- )
170
- with open(status_path, "w") as f:
171
- json.dump(self.current_version_deployment_status(), f, indent=4)
172
-
173
- def report_current_status(self, logger):
174
- if len(self._status_trail) == 0:
175
- return
176
- older_status = None
177
- if len(self._status_trail) >= 2:
178
- older_status = _capsule_worker_semantic_status(
179
- self._status_trail[-2].get("status"),
180
- self._end_state_capsule_version,
181
- self._minimum_replicas,
182
- )
183
- current_status = self.current_version_deployment_status()
184
- changes = _capsule_worker_status_diff(current_status, older_status)
185
- if len(changes) > 0:
186
- logger(*changes)
187
-
188
- @property
189
- def current_status(self) -> List[WorkerStatus]:
190
- return self._status_trail[-1].get("status") # type: ignore
191
-
192
- def current_version_deployment_status(self) -> CapsuleWorkerSemanticStatus:
193
- return _capsule_worker_semantic_status(
194
- self.current_status, self._end_state_capsule_version, self._minimum_replicas
195
- )
196
-
197
- @property
198
- def is_crashlooping(self) -> bool:
199
- status = self.current_version_deployment_status()
200
- return status["status"]["at_least_one_crashlooping"]
201
-
202
-
203
- class CapsuleInput:
204
- @classmethod
205
- def construct_exec_command(cls, commands: list[str]):
206
- commands = ["set -eEuo pipefail"] + commands
207
- command_string = "\n".join(commands)
208
- # First construct a base64-encoded string of the quoted command
209
- # One of the reasons we don't directly pass the command string to the backend with a `\n` join
210
- # is because the backend controller doesn't play nice when the command is a multi-line string.
211
- # So we encode it to a base64 string and then decode it back to a command string at runtime to provide to
212
- # `bash -c`. The ideal approach would have been to run "bash -c {shlex.quote(command_string)}" and call it a day,
213
- # but the backend controller yields the following error:
214
- # `error parsing template: error converting YAML to JSON: yaml: line 111: mapping values are not allowed in this context`
215
- # So we go to great lengths to ensure the command is provided in base64 to avoid any issues with the backend controller (an illustrative example of the generated command follows this method).
216
- import base64
217
-
218
- encoded_command = base64.b64encode(command_string.encode()).decode()
219
- decode_cmd = f"echo {encoded_command} | base64 -d > ./_ob_app_run.sh"
220
- return (
221
- f"bash -c '{decode_cmd} && cat ./_ob_app_run.sh && bash ./_ob_app_run.sh'"
222
- )
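For illustration only (not output captured from the package), a single-command input produces an entrypoint of roughly this shape, where <BASE64> stands for the base64 encoding of the generated script:

CapsuleInput.construct_exec_command(["python app.py"])
# -> "bash -c 'echo <BASE64> | base64 -d > ./_ob_app_run.sh && cat ./_ob_app_run.sh && bash ./_ob_app_run.sh'"
# where <BASE64> decodes to:
#   set -eEuo pipefail
#   python app.py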
223
-
224
- @classmethod
225
- def _marshal_environment_variables(cls, app_config: AppConfig):
226
- envs = app_config.get_state("environment", {}).copy()
227
- _return = []
228
- for k, v in envs.items():
229
- _v = v
230
- if isinstance(v, dict):
231
- _v = json.dumps(v)
232
- elif isinstance(v, list):
233
- _v = json.dumps(v)
234
- else:
235
- _v = str(v)
236
- _return.append(
237
- {
238
- "name": k,
239
- "value": _v,
240
- }
241
- )
242
- return _return
243
-
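A hedged illustration of the marshaling above (the variable names and values are made up): dict and list values are JSON-encoded, everything else is stringified:

CapsuleInput._marshal_environment_variables(app_config)  # with environment {"PORT": 8000, "FEATURES": {"beta": True}}
# -> [{"name": "PORT", "value": "8000"},
#     {"name": "FEATURES", "value": '{"beta": true}'}]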
244
- @classmethod
245
- def from_app_config(cls, app_config: AppConfig):
246
- gpu_resource = app_config.get_state("resources").get("gpu")
247
- resources = {}
248
- shared_memory = app_config.get_state("resources").get("shared_memory")
249
- if gpu_resource:
250
- resources["gpu"] = gpu_resource
251
- if shared_memory:
252
- resources["sharedMemory"] = shared_memory
253
-
254
- _scheduling_config = {}
255
- if app_config.get_state("compute_pools", None):
256
- _scheduling_config["schedulingConfig"] = {
257
- "computePools": [
258
- {"name": x} for x in app_config.get_state("compute_pools")
259
- ]
260
- }
261
- _description = app_config.get_state("description")
262
- _app_type = app_config.get_state("app_type")
263
- _final_info = {}
264
- if _description:
265
- _final_info["description"] = _description
266
- if _app_type:
267
- _final_info["endpointType"] = _app_type
268
- return {
269
- "perimeter": app_config.get_state("perimeter"),
270
- **_final_info,
271
- "codePackagePath": app_config.get_state("code_package_url"),
272
- "image": app_config.get_state("image"),
273
- "resourceIntegrations": [
274
- {"name": x} for x in app_config.get_state("secrets", [])
275
- ],
276
- "resourceConfig": {
277
- "cpu": str(app_config.get_state("resources").get("cpu")),
278
- "memory": str(app_config.get_state("resources").get("memory")),
279
- "ephemeralStorage": str(app_config.get_state("resources").get("disk")),
280
- **resources,
281
- },
282
- "autoscalingConfig": {
283
- "minReplicas": app_config.get_state("replicas", {}).get("min", 1),
284
- "maxReplicas": app_config.get_state("replicas", {}).get("max", 1),
285
- },
286
- **_scheduling_config,
287
- "containerStartupConfig": {
288
- "entrypoint": self.construct_exec_command(
289
- app_config.get_state("commands")
290
- )
291
- },
292
- "environmentVariables": self._marshal_environment_variables(app_config),
293
- # "assets": [{"name": "startup-script.sh"}],
294
- "authConfig": {
295
- "authType": app_config.get_state("auth").get("type"),
296
- "publicToDeployment": app_config.get_state("auth").get("public"),
297
- },
298
- "tags": [
299
- dict(key=k, value=v)
300
- for tag in app_config.get_state("tags", [])
301
- for k, v in tag.items()
302
- ],
303
- "port": app_config.get_state("port"),
304
- "displayName": app_config.get_state("name"),
305
- "forceUpdate": app_config.get_state("force_upgrade", False),
306
- }
307
-
308
-
309
- class CapsuleApiException(Exception):
310
- def __init__(
311
- self,
312
- url: str,
313
- method: str,
314
- status_code: int,
315
- text: str,
316
- message: Optional[str] = None,
317
- ):
318
- self.url = url
319
- self.method = method
320
- self.status_code = status_code
321
- self.text = text
322
- self.message = message
323
-
324
- def __str__(self):
325
- return (
326
- f"CapsuleApiException: {self.url} [{self.method}]: Status Code: {self.status_code} \n\n {self.text}"
327
- + (f"\n\n {self.message}" if self.message else "")
328
- )
329
-
330
-
331
- class CapsuleDeploymentException(Exception):
332
- def __init__(
333
- self,
334
- capsule_id: str,
335
- message: str,
336
- ):
337
- self.capsule_id = capsule_id
338
- self.message = message
339
-
340
- def __str__(self):
341
- return f"CapsuleDeploymentException: [{self.capsule_id}] :: {self.message}"
342
-
343
-
344
- class CapsuleApi:
345
- def __init__(self, base_url: str, perimeter: str, logger_fn=None):
346
- self._base_url = self._create_base_url(base_url, perimeter)
347
- from metaflow.metaflow_config import SERVICE_HEADERS
348
-
349
- self._logger_fn = logger_fn
350
- self._request_headers = {
351
- **{"Content-Type": "application/json", "Connection": "keep-alive"},
352
- **(SERVICE_HEADERS or {}),
353
- }
354
-
355
- @staticmethod
356
- def _create_base_url(base_url: str, perimeter: str):
357
- return os.path.join(
358
- base_url,
359
- "v1",
360
- "perimeters",
361
- perimeter,
362
- "capsules",
363
- )
364
-
365
- def _wrapped_api_caller(self, method_func, *args, **kwargs):
366
- try:
367
- response = safe_requests_wrapper(
368
- method_func,
369
- *args,
370
- headers=self._request_headers,
371
- logger_fn=self._logger_fn,
372
- **kwargs,
373
- )
374
- except MaximumRetriesExceeded as e:
375
- raise CapsuleApiException(
376
- e.url,
377
- e.method,
378
- e.status_code,
379
- e.text,
380
- message=f"Maximum retries exceeded for {e.url} [{e.method}]",
381
- )
382
- if response.status_code >= 400:
383
- raise CapsuleApiException(
384
- args[0],
385
- method_func.__name__,
386
- response.status_code,
387
- response.text,
388
- )
389
- return response
390
-
391
- def create(self, capsule_input: dict):
392
- _data = json.dumps(capsule_input)
393
- response = self._wrapped_api_caller(
394
- requests.post,
395
- self._base_url,
396
- data=_data,
397
- )
398
- try:
399
- return response.json()
400
- except json.JSONDecodeError as e:
401
- raise CapsuleApiException(
402
- self._base_url,
403
- "post",
404
- response.status_code,
405
- response.text,
406
- message="Capsule JSON decode failed",
407
- )
408
-
409
- def get(self, capsule_id: str):
410
- _url = os.path.join(self._base_url, capsule_id)
411
- response = self._wrapped_api_caller(
412
- requests.get,
413
- _url,
414
- retryable_status_codes=[409, 404], # todo : verify me
415
- conn_error_retries=3,
416
- )
417
- try:
418
- return response.json()
419
- except json.JSONDecodeError as e:
420
- raise CapsuleApiException(
421
- _url,
422
- "get",
423
- response.status_code,
424
- response.text,
425
- message="Capsule JSON decode failed",
426
- )
427
-
428
- def list(self):
429
- response = self._wrapped_api_caller(
430
- requests.get,
431
- self._base_url,
432
- retryable_status_codes=[409], # todo : verify me
433
- conn_error_retries=3,
434
- )
435
- try:
436
- response_json = response.json()
437
- except json.JSONDecodeError as e:
438
- raise CapsuleApiException(
439
- self._base_url,
440
- "get",
441
- response.status_code,
442
- response.text,
443
- message="Capsule JSON decode failed",
444
- )
445
- if "capsules" not in response_json:
446
- raise CapsuleApiException(
447
- self._base_url,
448
- "get",
449
- response.status_code,
450
- response.text,
451
- message="Capsule JSON decode failed",
452
- )
453
- return response_json.get("capsules", []) or []
454
-
455
- def delete(self, capsule_id: str):
456
- _url = os.path.join(self._base_url, capsule_id)
457
- response = self._wrapped_api_caller(
458
- requests.delete,
459
- _url,
460
- retryable_status_codes=[409], # todo : verify me
461
- )
462
- if response.status_code >= 400:
463
- raise CapsuleApiException(
464
- _url,
465
- "delete",
466
- response.status_code,
467
- response.text,
468
- )
469
-
470
- if response.status_code == 200:
471
- return True
472
- return False
473
-
474
- def get_workers(self, capsule_id: str) -> List[Dict[str, Any]]:
475
- _url = os.path.join(self._base_url, capsule_id, "workers")
476
- response = self._wrapped_api_caller(
477
- requests.get,
478
- _url,
479
- retryable_status_codes=[409, 404], # todo : verify me
480
- # Adding 404s because sometimes we might even end up getting 404s if
481
- # the backend cache is not updated yet. So on consistent 404s we should
482
- # just crash out.
483
- conn_error_retries=3,
484
- )
485
- try:
486
- return response.json().get("workers", []) or []
487
- except json.JSONDecodeError as e:
488
- raise CapsuleApiException(
489
- _url,
490
- "get",
491
- response.status_code,
492
- response.text,
493
- message="Capsule JSON decode failed",
494
- )
495
-
496
- def logs(
497
- self, capsule_id: str, worker_id: str, previous: bool = False
498
- ) -> List[str]:
499
- _url = os.path.join(self._base_url, capsule_id, "workers", worker_id, "logs")
500
- options = None
501
- if previous:
502
- options = {"previous": True}
503
- response = self._wrapped_api_caller(
504
- requests.get,
505
- _url,
506
- retryable_status_codes=[409], # todo : verify me
507
- params=options,
508
- )
509
- try:
510
- return response.json().get("logs", []) or []
511
- except json.JSONDecodeError as e:
512
- raise CapsuleApiException(
513
- _url,
514
- "get",
515
- response.status_code,
516
- response.text,
517
- message="Capsule JSON decode failed",
518
- )
519
-
520
-
521
- def list_and_filter_capsules(
522
- capsule_api: CapsuleApi, project, branch, name, tags, auth_type, capsule_id
523
- ):
524
- capsules = capsule_api.list()
525
-
526
- def _tags_match(tags, key, value):
527
- for t in tags:
528
- if t["key"] == key and t["value"] == value:
529
- return True
530
- return False
531
-
532
- def _all_tags_match(tags, tags_to_match):
533
- return all([_tags_match(tags, t["key"], t["value"]) for t in tags_to_match])
534
-
535
- def _filter_capsules(capsules, project, branch, name, tags, auth_type, capsule_id):
536
- _filtered_capsules = []
537
- for capsule in capsules:
538
- set_tags = capsule.get("spec", {}).get("tags", [])
539
- display_name = capsule.get("spec", {}).get("displayName", None)
540
- set_id = capsule.get("id", None)
541
- set_auth_type = (
542
- capsule.get("spec", {}).get("authConfig", {}).get("authType", None)
543
- )
544
-
545
- if auth_type and set_auth_type != auth_type:
546
- continue
547
- if project and not _tags_match(set_tags, "project", project):
548
- continue
549
- if branch and not _tags_match(set_tags, "branch", branch):
550
- continue
551
- if name and display_name != name:
552
- continue
553
- if tags and not _all_tags_match(set_tags, tags):
554
- continue
555
- if capsule_id and set_id != capsule_id:
556
- continue
557
-
558
- _filtered_capsules.append(capsule)
559
- return _filtered_capsules
560
-
561
- return _filter_capsules(
562
- capsules, project, branch, name, tags, auth_type, capsule_id
563
- )
564
-
565
-
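A minimal usage sketch for the API client and the filter helper above; the base URL, perimeter, and tag values are placeholders, not values taken from the package:

api = CapsuleApi(
    "https://api.example.outerbounds.xyz",   # hypothetical deployment API base URL
    "my-perimeter",
    logger_fn=partial(print, file=sys.stderr),
)
matching = list_and_filter_capsules(
    api, project="demo", branch="main", name=None,
    tags=[{"key": "team", "value": "ml"}], auth_type=None, capsule_id=None,
)
for capsule in matching:
    print(capsule["id"], len(api.get_workers(capsule["id"])), "workers")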
566
- from collections import namedtuple
567
-
568
- CapsuleInfo = namedtuple("CapsuleInfo", ["info", "workers"])
569
-
570
-
571
- class CapsuleDeployer:
572
-
573
- status: CapsuleStateMachine
574
-
575
- identifier = None
576
-
577
- # TODO: The current default timeout of 5 minutes is very large. Ideally the deployment should finish in less than 1 minute.
578
- def __init__(
579
- self,
580
- app_config: AppConfig,
581
- base_url: str,
582
- create_timeout: int = 60 * 5,
583
- debug_dir: Optional[str] = None,
584
- success_terminal_state_condition: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
585
- readiness_wait_time: int = 20,
586
- logger_fn=None,
587
- ):
588
- self._app_config = app_config
589
- self._capsule_api = CapsuleApi(
590
- base_url,
591
- app_config.get_state("perimeter"),
592
- logger_fn=logger_fn or partial(print, file=sys.stderr),
593
- )
594
- self._create_timeout = create_timeout
595
- self._logger_fn = logger_fn
596
- self._debug_dir = debug_dir
597
- self._capsule_deploy_response = None
598
- self._success_terminal_state_condition = success_terminal_state_condition
599
- self._readiness_wait_time = readiness_wait_time
600
-
601
- @property
602
- def capsule_api(self):
603
- return self._capsule_api
604
-
605
- @property
606
- def capsule_type(self):
607
- auth_type = self._app_config.get_state("auth", {}).get("type", AuthType.default)
608
- if auth_type == AuthType.BROWSER:
609
- return "App"
610
- elif auth_type == AuthType.API:
611
- return "Endpoint"
612
- else:
613
- raise TODOException(f"Unknown auth type: {auth_type}")
614
-
615
- @property
616
- def name(self):
617
- return self._app_config.get_state("name")
618
-
619
- def create_input(self):
620
- return experimental.capsule_input_overrides(
621
- self._app_config, CapsuleInput.from_app_config(self._app_config)
622
- )
623
-
624
- @property
625
- def current_deployment_instance_version(self):
626
- """
627
- The backend `create` call returns the version of the object that will be deployed by this instance of the deployer.
628
- """
629
- if self._capsule_deploy_response is None:
630
- return None
631
- return self._capsule_deploy_response.get("version", None)
632
-
633
- def create(self):
634
- capsule_response = self._capsule_api.create(self.create_input())
635
- self.identifier = capsule_response.get("id")
636
- self._capsule_deploy_response = capsule_response
637
- return self.identifier
638
-
639
- def get(self):
640
- return self._capsule_api.get(self.identifier)
641
-
642
- def get_workers(self):
643
- return self._capsule_api.get_workers(self.identifier)
644
-
645
- def _backend_version_mismatch_check(
646
- self, capsule_response: dict, current_deployment_instance_version: str
647
- ):
648
- """
649
- - `capsule_response.version` contains the version of the object present in the database
650
- - `current_deployment_instance_version` contains the version of the object that was deployed by this instance of the deployer.
651
- If the versions of the objects diverge, it means that the current deployment process is not giving the user the
652
- output that they desire.
653
- """
654
- if capsule_response.get("version", None) != current_deployment_instance_version:
655
- raise CapsuleDeploymentException(
656
- self.identifier, # type: ignore
657
- f"A capsule upgrade was triggered outside current deployment instance. Current deployment version was discarded. Current deployment version: {current_deployment_instance_version} and new version: {capsule_response.get('version', None)}",
658
- )
659
-
660
- def _monitor_worker_readiness(
661
- self,
662
- workers_sm: "CapsuleWorkersStateMachine",
663
- ):
664
- """returns True if the worker is crashlooping, False otherwise"""
665
- logger = self._logger_fn or partial(print, file=sys.stderr)
666
- for i in range(self._readiness_wait_time):
667
- time.sleep(1)
668
- workers_response = self.get_workers()
669
- workers_sm.add_status(workers_response)
670
- workers_sm.report_current_status(logger)
671
- if workers_sm.is_crashlooping:
672
- return True
673
- return False
674
-
675
- def _extract_logs_from_crashlooping_worker(
676
- self, workers_sm: "CapsuleWorkersStateMachine"
677
- ):
678
- def _extract_worker_id_of_crashlooping_worker(
679
- workers_status: List[WorkerStatus],
680
- ):
681
- for worker in workers_status:
682
- if worker["phase"] == "CrashLoopBackOff":
683
- return worker["workerId"]
684
- return None
685
-
686
- worker_id = _extract_worker_id_of_crashlooping_worker(workers_sm.current_status)
687
- if worker_id is None:
688
- return None, None
689
- logs = self.capsule_api.logs(self.identifier, worker_id, previous=True)
690
- return logs, worker_id
691
-
692
- def wait_for_terminal_state(
693
- self,
694
- ):
695
- """ """
696
- logger = self._logger_fn or partial(print, file=sys.stderr)
697
- state_machine = CapsuleStateMachine(
698
- self.identifier, self.current_deployment_instance_version
699
- )
700
- # min_replicas will always be present
701
- min_replicas = self._app_config.get_state("replicas", {}).get("min")
702
- workers_state_machine = CapsuleWorkersStateMachine(
703
- self.identifier,
704
- self.current_deployment_instance_version,
705
- deployment_mode=self._success_terminal_state_condition,
706
- minimum_replicas=min_replicas,
707
- )
708
- self.status = state_machine
709
- for i in range(self._create_timeout):
710
- time.sleep(1)
711
- capsule_response = self.get()
712
- workers_response = self.get_workers()
713
-
714
- # We first need to check whether someone has upgraded the capsule under the hood, leaving
715
- # the current deployment instance invalid.
716
- self._backend_version_mismatch_check(
717
- capsule_response, self.current_deployment_instance_version
718
- )
719
- state_machine.add_status(capsule_response.get("status", {}))
720
- workers_state_machine.add_status(workers_response)
721
- state_machine.report_current_status(logger)
722
-
723
- workers_state_machine.report_current_status(logger)
724
- # Deployment readiness checks determine the terminal state
725
- # of the worker state machine. If we detect a terminal state in the workers,
726
- # then even if the capsule upgrade is still in progress, we will end up crashing
727
- # the deployment.
728
- (
729
- capsule_ready,
730
- further_check_worker_readiness,
731
- ) = DEPLOYMENT_READY_CONDITIONS.check_readiness_condition(
732
- state_machine.current_status,
733
- workers_state_machine.current_version_deployment_status(),
734
- self._success_terminal_state_condition,
735
- )
736
-
737
- failure_condition_satisfied = (
738
- DEPLOYMENT_READY_CONDITIONS.check_failure_condition(
739
- state_machine.current_status,
740
- workers_state_machine.current_version_deployment_status(),
741
- )
742
- )
743
- if capsule_ready or failure_condition_satisfied:
744
- logger(
745
- "💊 %s deployment status: %s | worker states: [success :%s | failure :%s ] "
746
- % (
747
- self.capsule_type.title(),
748
- "in progress"
749
- if state_machine.update_in_progress
750
- else "completed",
751
- capsule_ready,
752
- failure_condition_satisfied,
753
- )
754
- )
755
- _further_readiness_check_failed = False
756
- if further_check_worker_readiness:
757
- # HACK: monitor the workers for N seconds to make sure they are healthy.
758
- # This is a hack. Ideally we should implement a healthcheck as a first-class citizen,
759
- # but that will take some time, so in the meantime a timeout set on the CLI
760
- # side will be really helpful.
761
- logger(
762
- "💊 running last minute readiness check for %s..."
763
- % self.identifier
764
- )
765
- _further_readiness_check_failed = self._monitor_worker_readiness(
766
- workers_state_machine
767
- )
768
-
769
- if CAPSULE_DEBUG:
770
- logger(
771
- f"[debug] 💊 {self.capsule_type} {self.identifier}: further_check_worker_readiness {_further_readiness_check_failed} | failure_condition_satisfied {failure_condition_satisfied}"
772
- )
773
-
774
- # We should still check for failure state and crash if we detect something in the readiness check
775
- if failure_condition_satisfied or _further_readiness_check_failed:
776
- # hit the logs endpoint for the worker and get the logs
777
- # Print those logs out on the terminal
778
- # raise an exception that should be caught gracefully by the cli
779
- logs, worker_id = self._extract_logs_from_crashlooping_worker(
780
- workers_state_machine
781
- )
782
- if logs is not None:
783
- # todo: It would be really odd if the logs are not present and we discover something is crashlooping.
784
- # Handle that condition later
785
- logger(
786
- *(
787
- [
788
- f"💥 Worker ID ({worker_id}) is crashlooping. Please check the following logs for more information: "
789
- ]
790
- + ["\t" + l["message"] for l in logs]
791
- )
792
- )
793
- raise CapsuleDeploymentException(
794
- self.identifier,
795
- f"Worker ID ({worker_id}) is crashlooping. Please check the logs for more information.",
796
- )
797
-
798
- if state_machine.ready_to_serve_traffic:
799
- logger(
800
- "💊 %s %s is ready to serve traffic on the URL: %s"
801
- % (
802
- self.capsule_type,
803
- self.identifier,
804
- state_machine.out_of_cluster_url,
805
- ),
806
- )
807
-
808
- break
809
-
810
- if CAPSULE_DEBUG and self._debug_dir:
811
- state_machine.save_debug_info(self._debug_dir)
812
- workers_state_machine.save_debug_info(self._debug_dir)
813
- if i % 3 == 0: # Every 3 seconds report the status
814
- logger(
815
- f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
816
- )
817
-
818
- if not self.status.ready_to_serve_traffic:
819
- raise CapsuleDeploymentException(
820
- self.identifier,
821
- f"Capsule {self.identifier} failed to be ready to serve traffic",
822
- )
823
-
824
- if CAPSULE_DEBUG and self._debug_dir:
825
- state_machine.save_debug_info(self._debug_dir)
826
- workers_state_machine.save_debug_info(self._debug_dir)
827
- logger(
828
- f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status [on return]: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
829
- )
830
-
831
- return dict(
832
- id=self.identifier,
833
- auth_type=self.capsule_type,
834
- public_url=self.status.out_of_cluster_url,
835
- available_replicas=self.status.available_replicas,
836
- name=self.name,
837
- deployed_version=self.current_deployment_instance_version,
838
- deployed_at=datetime.now().isoformat(),
839
- )
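Finally, a hedged end-to-end sketch of how CapsuleDeployer appears to be driven (the `app_config` object and the base URL are assumptions; the real wiring lives in the CLI layer of the package):

deployer = CapsuleDeployer(
    app_config,                                  # an AppConfig assembled elsewhere (assumption)
    "https://api.example.outerbounds.xyz",       # hypothetical deployment API base URL
    success_terminal_state_condition=DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
    debug_dir="/tmp/capsule-debug" if CAPSULE_DEBUG else None,
)
capsule_id = deployer.create()
result = deployer.wait_for_terminal_state()      # raises CapsuleDeploymentException on failure
print(result["id"], result["public_url"], result["available_replicas"])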