ob-metaflow-extensions 1.1.174__py2.py3-none-any.whl → 1.1.175rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (31) hide show
  1. metaflow_extensions/outerbounds/plugins/__init__.py +1 -0
  2. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +3 -0
  3. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +1 -0
  4. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +470 -0
  5. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  6. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1521 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +325 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +859 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/cli_to_config.py +99 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +610 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +269 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/config_schema_autogen.json +336 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +0 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +110 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +45 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +22 -0
  26. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  27. metaflow_extensions/outerbounds/toplevel/ob_internal.py +2 -0
  28. {ob_metaflow_extensions-1.1.174.dist-info → ob_metaflow_extensions-1.1.175rc0.dist-info}/METADATA +1 -1
  29. {ob_metaflow_extensions-1.1.174.dist-info → ob_metaflow_extensions-1.1.175rc0.dist-info}/RECORD +31 -6
  30. {ob_metaflow_extensions-1.1.174.dist-info → ob_metaflow_extensions-1.1.175rc0.dist-info}/WHEEL +0 -0
  31. {ob_metaflow_extensions-1.1.174.dist-info → ob_metaflow_extensions-1.1.175rc0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,859 @@
1
+ from datetime import datetime
2
+ import json
3
+ import os
4
+ import pathlib
5
+ import requests
6
+ import sys
7
+ import time
8
+ from functools import partial
9
+ import shlex
10
+ from typing import Optional, List, Dict, Any, Tuple, Union
11
+ from .utils import TODOException, safe_requests_wrapper, MaximumRetriesExceeded
12
+ from .app_config import AppConfig, CAPSULE_DEBUG, AuthType
13
+ from . import experimental
14
+ from ._state_machine import (
15
+ _capsule_worker_semantic_status,
16
+ _capsule_worker_status_diff,
17
+ CapsuleWorkerSemanticStatus,
18
+ WorkerStatus,
19
+ CapsuleStatus,
20
+ DEPLOYMENT_READY_CONDITIONS,
21
+ )
22
+
23
+
24
+ def _format_url_string(url):
25
+ if url is None:
26
+ return None
27
+
28
+ if url.startswith("http://") or url.startswith("https://"):
29
+ return url
30
+
31
+ return f"https://{url}"
32
+
33
+
34
class CapsuleStateMachine:
    """
    Keeps a timestamped trail of capsule status payloads and exposes the
    fields of the most recent one as properties.

    - Every capsule create call will return an `identifier` and a `version` of the object.
    - Each update call will return a new version.
    - The status.currentlyServedVersion will be the version that is currently serving traffic.
    - The status.updateInProgress will be True if an upgrade is in progress.

    CapsuleState Transition:
    - Every capsule create call will return an `identifier` and a `version` of the object.
    - Happy Path:
        - First time Create :
            - wait for status.updateInProgress to be set to False
                - (interleaved) Poll the worker endpoints to check their status
                    - showcase how many workers are coming up if things are on the cli side.
                - If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
                  serve traffic.
            - once the status.updateInProgress is set to False, it means that the replicas are ready
        - Upgrade:
            - wait for status.updateInProgress to be set to False
                - (interleaved) Poll the worker endpoints to check their status and signal the user the number replicas coming up
                - If the user has set some flag like `--dont-wait-to-fully-finish` then we check the `status.currentlyServedVersion` to see if even one replica is ready to
                  serve traffic.
    - Unhappy Path:
        - First time Create :
            - wait for status.updateInProgress to be set to False,
                - (interleaved) Poll the workers to check their status.
                    - If the worker pertaining the current deployment instance version is crashlooping then crash the deployment process with the error messages and logs.
        - Upgrade:
            - wait for status.updateInProgress to be set to False,
                - (interleaved) Poll the workers to check their status.
                    - If the worker pertaining the current deployment instance version is crashlooping then crash the deployment process with the error messages and logs.

    """

    def __init__(self, capsule_id: str, current_deployment_instance_version: str):
        self._capsule_id = capsule_id
        # Each entry is {"timestamp": <time.time()>, "status": <status dict>}.
        self._status_trail: List[Dict[str, Any]] = []
        self._current_deployment_instance_version = current_deployment_instance_version

    def get_status_trail(self):
        """Return the list of recorded entries (the live list, not a copy)."""
        return self._status_trail

    def add_status(self, status: "CapsuleStatus"):
        """Append `status` to the trail together with the current wall-clock time.

        Raises:
            AssertionError: if `status` is not a dict.
        """
        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; `isinstance` also admits dict subclasses.
        if not isinstance(status, dict):
            raise AssertionError("TODO: Make this check somewhere else")
        self._status_trail.append({"timestamp": time.time(), "status": status})

    @property
    def current_status(self):
        """Most recently recorded status dict.

        Raises IndexError when no status has been recorded yet.
        """
        return self._status_trail[-1].get("status")

    @property
    def out_of_cluster_url(self):
        """`accessInfo.outOfClusterURL` of the current status, scheme-qualified, or None."""
        access_info = self.current_status.get("accessInfo", {}) or {}
        return _format_url_string(access_info.get("outOfClusterURL", None))

    @property
    def in_cluster_url(self):
        """`accessInfo.inClusterURL` of the current status, scheme-qualified, or None."""
        access_info = self.current_status.get("accessInfo", {}) or {}
        return _format_url_string(access_info.get("inClusterURL", None))

    @property
    def update_in_progress(self):
        """`updateInProgress` of the current status (False when absent)."""
        return self.current_status.get("updateInProgress", False)

    @property
    def currently_served_version(self):
        """`currentlyServedVersion` of the current status (None when absent)."""
        return self.current_status.get("currentlyServedVersion", None)

    @property
    def ready_to_serve_traffic(self):
        """True only when the status flags readiness AND at least one URL is known."""
        if self.current_status.get("readyToServeTraffic", False):
            return any(
                i is not None for i in [self.out_of_cluster_url, self.in_cluster_url]
            )
        return False

    @property
    def available_replicas(self):
        """`availableReplicas` of the current status (0 when absent)."""
        return self.current_status.get("availableReplicas", 0)

    def report_current_status(self, logger):
        """Intentionally a no-op; accepts `logger` for signature parity with the workers state machine."""
        pass

    def save_debug_info(self, state_dir: str):
        """Dump the whole status trail as JSON to `<state_dir>/debug_capsule_<capsule_id>.json`."""
        debug_path = os.path.join(state_dir, f"debug_capsule_{self._capsule_id}.json")
        with open(debug_path, "w") as f:
            json.dump(self._status_trail, f, indent=4)
121
+
122
+
123
class CapsuleWorkersStateMachine:
    """Timestamped trail of worker-list responses for one capsule.

    The semantic interpretation of a worker list is delegated to
    `_capsule_worker_semantic_status`, always evaluated against the capsule
    version and minimum replica count given at construction.
    """

    def __init__(
        self,
        capsule_id: str,
        end_state_capsule_version: str,
        deployment_mode: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
        minimum_replicas: int = 1,
    ):
        self._status_trail: List[Dict[str, Union[float, List[WorkerStatus]]]] = []
        self._capsule_id = capsule_id
        self._end_state_capsule_version = end_state_capsule_version
        self._deployment_mode = deployment_mode
        self._minimum_replicas = minimum_replicas

    def get_status_trail(self):
        """Return the recorded entries (the live list, not a copy)."""
        return self._status_trail

    def add_status(self, worker_list_response: List[WorkerStatus]):
        """
        Record a worker-list response together with the current time.

        worker_list_response: List[Dict[str, Any]]
        [
            {
                "workerId": "c-4pqikm-659dd9ccdc-5hcwz",
                "phase": "Running",
                "activity": 0,
                "activityDataAvailable": false,
                "version": "0xhgaewiqb"
            },
            {
                "workerId": "c-4pqikm-b8559688b-xk2jh",
                "phase": "Pending",
                "activity": 0,
                "activityDataAvailable": false,
                "version": "421h48qh95"
            }
        ]
        """
        entry = {"timestamp": time.time(), "status": worker_list_response}
        self._status_trail.append(entry)

    def save_debug_info(self, state_dir: str):
        """Write the raw trail and the current semantic status as two JSON files in `state_dir`."""
        trail_path = os.path.join(
            state_dir, f"debug_capsule_workers_{self._capsule_id}_trail.json"
        )
        with open(trail_path, "w") as trail_file:
            json.dump(self._status_trail, trail_file, indent=4)

        semantic_path = os.path.join(
            state_dir, f"debug_capsule_workers_{self._capsule_id}_status.json"
        )
        with open(semantic_path, "w") as semantic_file:
            json.dump(self.current_version_deployment_status(), semantic_file, indent=4)

    def report_current_status(self, logger):
        """Log the differences between the latest and the previous semantic status.

        Does nothing when the trail is empty or when the diff is empty. With
        a single recorded entry the previous status passed to the diff is None.
        """
        trail = self._status_trail
        if not trail:
            return

        previous = None
        if len(trail) > 1:
            previous = _capsule_worker_semantic_status(
                trail[-2].get("status"),
                self._end_state_capsule_version,
                self._minimum_replicas,
            )

        latest = self.current_version_deployment_status()
        changes = _capsule_worker_status_diff(latest, previous)
        if len(changes) > 0:
            logger(*changes)

    @property
    def current_status(self) -> List[WorkerStatus]:
        """Most recently recorded worker list (IndexError if nothing was recorded)."""
        newest_entry = self._status_trail[-1]
        return newest_entry.get("status")  # type: ignore

    def current_version_deployment_status(self) -> CapsuleWorkerSemanticStatus:
        """Semantic status of the latest worker list for the target capsule version."""
        return _capsule_worker_semantic_status(
            self.current_status, self._end_state_capsule_version, self._minimum_replicas
        )

    @property
    def is_crashlooping(self) -> bool:
        """The `at_least_one_crashlooping` flag of the current semantic status."""
        semantic = self.current_version_deployment_status()
        return semantic["status"]["at_least_one_crashlooping"]
205
+
206
+
207
class CapsuleInput:
    """Builds the dict payload for the capsule create API from an `AppConfig`."""

    @classmethod
    def construct_exec_command(cls, commands: List[str]):
        """Wrap `commands` into a single-line `bash -c` entrypoint string.

        The commands are prefixed with `set -eEuo pipefail`, joined with
        newlines and base64-encoded. The returned command decodes them into
        `./_ob_app_run.sh`, prints that script and then runs it with bash.
        """
        commands = ["set -eEuo pipefail"] + commands
        command_string = "\n".join(commands)
        # First construct a base64 encoded string of the quoted command.
        # One of the reasons we don't directly pass the command string to the backend with a `\n` join
        # is because the backend controller doesn't play nice when the command can be a multi-line string.
        # So we encode it to a base64 string and then decode it back to a command string at runtime to provide to
        # `bash -c`. The ideal thing to have done is to run "bash -c {shlex.quote(command_string)}" and call it a day
        # but the backend controller yields the following error:
        # `error parsing template: error converting YAML to JSON: yaml: line 111: mapping values are not allowed in this context`
        # So we go to great length to ensure the command is provided in base64 to avoid any issues with the backend controller.
        import base64

        encoded_command = base64.b64encode(command_string.encode()).decode()
        decode_cmd = f"echo {encoded_command} | base64 -d > ./_ob_app_run.sh"
        return (
            f"bash -c '{decode_cmd} && cat ./_ob_app_run.sh && bash ./_ob_app_run.sh'"
        )

    @classmethod
    def _marshal_environment_variables(cls, app_config: "AppConfig"):
        """Turn the `environment` state into a list of `{"name", "value"}` dicts.

        dict and list values are JSON-encoded; every other value goes through
        `str()`. A missing or `None` environment yields an empty list.
        """
        envs = app_config.get_state("environment", {}) or {}
        return [
            {
                "name": k,
                "value": json.dumps(v) if isinstance(v, (dict, list)) else str(v),
            }
            for k, v in envs.items()
        ]

    @classmethod
    def from_app_config(cls, app_config: "AppConfig"):
        """Assemble the capsule payload dict from the state held by `app_config`.

        Optional pieces (`gpu`, `sharedMemory`, `schedulingConfig`,
        `description`, `endpointType`) are only included when the
        corresponding state value is truthy.
        """
        resource_state = app_config.get_state("resources")
        auth_state = app_config.get_state("auth")
        replica_state = app_config.get_state("replicas", {})

        resources = {}
        gpu_resource = resource_state.get("gpu")
        shared_memory = resource_state.get("shared_memory")
        if gpu_resource:
            resources["gpu"] = gpu_resource
        if shared_memory:
            resources["sharedMemory"] = shared_memory

        _scheduling_config = {}
        compute_pools = app_config.get_state("compute_pools", None)
        if compute_pools:
            _scheduling_config["schedulingConfig"] = {
                "computePools": [{"name": x} for x in compute_pools]
            }

        _description = app_config.get_state("description")
        _app_type = app_config.get_state("app_type")
        _final_info = {}
        if _description:
            _final_info["description"] = _description
        if _app_type:
            _final_info["endpointType"] = _app_type

        return {
            "perimeter": app_config.get_state("perimeter"),
            **_final_info,
            "codePackagePath": app_config.get_state("code_package_url"),
            "image": app_config.get_state("image"),
            "resourceIntegrations": [
                {"name": x} for x in app_config.get_state("secrets", [])
            ],
            "resourceConfig": {
                "cpu": str(resource_state.get("cpu")),
                "memory": str(resource_state.get("memory")),
                "ephemeralStorage": str(resource_state.get("disk")),
                **resources,
            },
            "autoscalingConfig": {
                "minReplicas": replica_state.get("min"),
                "maxReplicas": replica_state.get("max"),
            },
            **_scheduling_config,
            "containerStartupConfig": {
                "entrypoint": cls.construct_exec_command(
                    app_config.get_state("commands")
                )
            },
            "environmentVariables": cls._marshal_environment_variables(app_config),
            # "assets": [{"name": "startup-script.sh"}],
            "authConfig": {
                "authType": auth_state.get("type"),
                "publicToDeployment": auth_state.get("public"),
            },
            # Each configured tag is a mapping; every key/value pair in it
            # becomes its own {"key", "value"} entry.
            "tags": [
                dict(key=k, value=v)
                for tag in app_config.get_state("tags", [])
                for k, v in tag.items()
            ],
            "port": app_config.get_state("port"),
            "displayName": app_config.get_state("name"),
            "forceUpdate": app_config.get_state("force_upgrade", False),
        }
311
+
312
+
313
class CapsuleApiException(Exception):
    """Error raised for a failed capsule API call.

    Carries the request URL, HTTP method, response status code and response
    text, plus an optional extra message appended to the string form.
    """

    def __init__(
        self,
        url: str,
        method: str,
        status_code: int,
        text: str,
        message: Optional[str] = None,
    ):
        self.url, self.method = url, method
        self.status_code, self.text = status_code, text
        self.message = message

    def __str__(self):
        rendered = f"CapsuleApiException: {self.url} [{self.method}]: Status Code: {self.status_code} \n\n {self.text}"
        if self.message:
            rendered += f"\n\n {self.message}"
        return rendered
333
+
334
+
335
class CapsuleDeploymentException(Exception):
    """Error describing a failed deployment of the capsule `capsule_id`."""

    def __init__(
        self,
        capsule_id: str,
        message: str,
    ):
        self.message = message
        self.capsule_id = capsule_id

    def __str__(self):
        return "CapsuleDeploymentException: [{}] :: {}".format(
            self.capsule_id, self.message
        )
346
+
347
+
348
class CapsuleApi:
    """HTTP client for the capsule endpoints under `<base_url>/v1/perimeters/<perimeter>/capsules`.

    Every request goes through `safe_requests_wrapper` with the headers built
    in `__init__`. Responses with a status code >= 400 and exhausted retries
    are surfaced as `CapsuleApiException`.
    """

    def __init__(self, base_url: str, perimeter: str, logger_fn=None):
        self._base_url = self._create_base_url(base_url, perimeter)
        from metaflow.metaflow_config import SERVICE_HEADERS

        self._logger_fn = logger_fn
        self._request_headers = {
            **{"Content-Type": "application/json", "Connection": "keep-alive"},
            **(SERVICE_HEADERS or {}),
        }

    @staticmethod
    def _join_url(*parts: str) -> str:
        """Join URL path segments with forward slashes on every platform.

        `posixpath.join` matches what `os.path.join` does on POSIX systems but,
        unlike `os.path.join`, never inserts backslashes on Windows.
        """
        import posixpath

        return posixpath.join(*parts)

    @staticmethod
    def _create_base_url(base_url: str, perimeter: str):
        """Return the capsule collection URL for `perimeter`."""
        return CapsuleApi._join_url(
            base_url,
            "v1",
            "perimeters",
            perimeter,
            "capsules",
        )

    @staticmethod
    def _decode_json(response, url: str, method: str):
        """Return `response.json()`, converting a decode failure into `CapsuleApiException`."""
        try:
            return response.json()
        except ValueError as e:
            # ValueError is the common base of json.JSONDecodeError and of
            # simplejson's decode error, which `requests` may raise instead
            # when simplejson is installed.
            raise CapsuleApiException(
                url,
                method,
                response.status_code,
                response.text,
                message="Capsule JSON decode failed",
            ) from e

    def _wrapped_api_caller(self, method_func, *args, **kwargs):
        """Invoke `method_func(*args, **kwargs)` through `safe_requests_wrapper`.

        `args[0]` is expected to be the request URL; it is used for error
        reporting.

        Raises:
            CapsuleApiException: when retries are exhausted or the response
                status code is >= 400.
        """
        try:
            response = safe_requests_wrapper(
                method_func,
                *args,
                headers=self._request_headers,
                logger_fn=self._logger_fn,
                **kwargs,
            )
        except MaximumRetriesExceeded as e:
            raise CapsuleApiException(
                e.url,
                e.method,
                e.status_code,
                e.text,
                message=f"Maximum retries exceeded for {e.url} [{e.method}]",
            ) from e
        if response.status_code >= 400:
            raise CapsuleApiException(
                args[0],
                method_func.__name__,
                response.status_code,
                response.text,
            )
        return response

    def create(self, capsule_input: dict):
        """POST `capsule_input` as JSON to the collection URL and return the decoded response."""
        _data = json.dumps(capsule_input)
        response = self._wrapped_api_caller(
            requests.post,
            self._base_url,
            data=_data,
        )
        return self._decode_json(response, self._base_url, "post")

    def get(self, capsule_id: str):
        """GET a single capsule and return the decoded response."""
        _url = self._join_url(self._base_url, capsule_id)
        response = self._wrapped_api_caller(
            requests.get,
            _url,
            retryable_status_codes=[409, 404],  # todo : verify me
            conn_error_retries=3,
        )
        return self._decode_json(response, _url, "get")

    def list(self):
        """GET the capsule collection and return its `capsules` list (never None).

        Raises:
            CapsuleApiException: when the body is not JSON or has no
                `capsules` key.
        """
        response = self._wrapped_api_caller(
            requests.get,
            self._base_url,
            retryable_status_codes=[409],  # todo : verify me
            conn_error_retries=3,
        )
        response_json = self._decode_json(response, self._base_url, "get")
        if "capsules" not in response_json:
            raise CapsuleApiException(
                self._base_url,
                "get",
                response.status_code,
                response.text,
                message='Capsule list response is missing the "capsules" key',
            )
        return response_json.get("capsules", []) or []

    def delete(self, capsule_id: str):
        """DELETE a capsule; return True only when the response status is exactly 200."""
        _url = self._join_url(self._base_url, capsule_id)
        response = self._wrapped_api_caller(
            requests.delete,
            _url,
            retryable_status_codes=[409],  # todo : verify me
        )
        # Status codes >= 400 have already been raised by _wrapped_api_caller.
        return response.status_code == 200

    def get_workers(self, capsule_id: str) -> List[Dict[str, Any]]:
        """GET the workers of a capsule and return the `workers` list (never None)."""
        _url = self._join_url(self._base_url, capsule_id, "workers")
        response = self._wrapped_api_caller(
            requests.get,
            _url,
            retryable_status_codes=[409, 404],  # todo : verify me
            # Adding 404s because sometimes we might even end up getting 404s if
            # the backend cache is not updated yet. So on consistent 404s we should
            # just crash out.
            conn_error_retries=3,
        )
        return self._decode_json(response, _url, "get").get("workers", []) or []

    def logs(
        self, capsule_id: str, worker_id: str, previous: bool = False
    ) -> List[str]:
        """GET the logs of one worker and return the `logs` list (never None).

        When `previous` is true the request carries the query parameter
        `previous=True`.
        """
        _url = self._join_url(
            self._base_url, capsule_id, "workers", worker_id, "logs"
        )
        options = {"previous": True} if previous else None
        response = self._wrapped_api_caller(
            requests.get,
            _url,
            retryable_status_codes=[409],  # todo : verify me
            params=options,
        )
        return self._decode_json(response, _url, "get").get("logs", []) or []
523
+
524
+
525
def list_and_filter_capsules(
    capsule_api: "CapsuleApi", project, branch, name, tags, auth_type, capsule_id
):
    """Return the capsules from `capsule_api.list()` that satisfy every given filter.

    A filter whose value is falsy is ignored. The filters are:

    - `auth_type`: equals `spec.authConfig.authType`
    - `project` / `branch`: a tag with key "project" / "branch" and that value
    - `name`: equals `spec.displayName`
    - `tags`: list of `{"key", "value"}` dicts that must all be present in `spec.tags`
    - `capsule_id`: equals the capsule `id`

    `spec`, `spec.tags` and `spec.authConfig` that are missing or `None` are
    treated as empty.
    """

    def _has_tag(capsule_tags, key, value):
        return any(t["key"] == key and t["value"] == value for t in capsule_tags)

    matching = []
    for capsule in capsule_api.list():
        spec = capsule.get("spec") or {}
        capsule_tags = spec.get("tags") or []
        set_auth_type = (spec.get("authConfig") or {}).get("authType", None)

        if auth_type and set_auth_type != auth_type:
            continue
        if project and not _has_tag(capsule_tags, "project", project):
            continue
        if branch and not _has_tag(capsule_tags, "branch", branch):
            continue
        if name and spec.get("displayName", None) != name:
            continue
        if tags and not all(
            _has_tag(capsule_tags, t["key"], t["value"]) for t in tags
        ):
            continue
        if capsule_id and capsule.get("id", None) != capsule_id:
            continue

        matching.append(capsule)
    return matching
568
+
569
+
570
from collections import namedtuple

# Lightweight record with two positional fields: `info` and `workers`.
CapsuleInfo = namedtuple("CapsuleInfo", "info workers")
573
+
574
+
575
class CapsuleDeployer:
    """Creates a capsule from an `AppConfig` and polls it until a terminal state.

    Typical use is `create()` followed by `wait_for_terminal_state()`.
    """

    # Set by `wait_for_terminal_state`; holds the capsule-level status trail.
    status: CapsuleStateMachine

    # Capsule id returned by the backend; None until `create()` has run.
    identifier = None

    # TODO: Current default timeout is very large of 5 minutes. Ideally we should have finished the deployed in less than 1 minutes.
    def __init__(
        self,
        app_config: AppConfig,
        base_url: str,
        create_timeout: int = 60 * 5,
        debug_dir: Optional[str] = None,
        success_terminal_state_condition: str = DEPLOYMENT_READY_CONDITIONS.ATLEAST_ONE_RUNNING,
        readiness_wait_time: int = 20,
        logger_fn=None,
    ):
        self._app_config = app_config
        self._capsule_api = CapsuleApi(
            base_url,
            app_config.get_state("perimeter"),
            logger_fn=logger_fn or partial(print, file=sys.stderr),
        )
        # Number of one-second polling iterations in `wait_for_terminal_state`.
        self._create_timeout = create_timeout
        self._logger_fn = logger_fn
        self._debug_dir = debug_dir
        self._capsule_deploy_response = None
        self._success_terminal_state_condition = success_terminal_state_condition
        # Number of one-second polling iterations in `_monitor_worker_readiness`.
        self._readiness_wait_time = readiness_wait_time

    @property
    def url(self):
        """`outOfClusterUrl` of the create response, scheme-qualified.

        Returns None when `create()` has not been called yet or the response
        has no such key.
        """
        # The fallback must be on the right-hand side: `{} or x` always
        # evaluates to `x`, which raised AttributeError while the response
        # was still None.
        return _format_url_string(
            (self._capsule_deploy_response or {}).get("outOfClusterUrl", None)
        )

    @property
    def capsule_api(self):
        return self._capsule_api

    @property
    def capsule_type(self):
        """"App" for browser auth, "Endpoint" for API auth; any other auth type raises `TODOException`."""
        auth_type = self._app_config.get_state("auth", {}).get("type", AuthType.default)
        if auth_type == AuthType.BROWSER:
            return "App"
        elif auth_type == AuthType.API:
            return "Endpoint"
        else:
            raise TODOException(f"Unknown auth type: {auth_type}")

    @property
    def name(self):
        return self._app_config.get_state("name")

    def create_input(self):
        """Build the create payload and pass it through the experimental overrides."""
        return experimental.capsule_input_overrides(
            self._app_config, CapsuleInput.from_app_config(self._app_config)
        )

    @property
    def current_deployment_instance_version(self):
        """
        The `version` field of the backend `create` response for this
        deployer, or None when `create()` has not been called yet.
        """
        if self._capsule_deploy_response is None:
            return None
        return self._capsule_deploy_response.get("version", None)

    def create(self):
        """Create the capsule, remember the response and return the capsule id."""
        capsule_response = self._capsule_api.create(self.create_input())
        self.identifier = capsule_response.get("id")
        self._capsule_deploy_response = capsule_response
        return self.identifier

    def get(self):
        return self._capsule_api.get(self.identifier)

    def get_workers(self):
        return self._capsule_api.get_workers(self.identifier)

    def _backend_version_mismatch_check(
        self, capsule_response: dict, current_deployment_instance_version: str
    ):
        """
        - `capsule_response.version` contains the version of the object present in the database
        - `current_deployment_instance_version` contains the version of the object that was deployed by this instance of the deployer.
        In the situation that the versions of the objects become a mismatch then it means that current deployment process is not giving the user the
        output that they desire.

        Raises `CapsuleDeploymentException` on a mismatch.
        """
        if capsule_response.get("version", None) != current_deployment_instance_version:
            raise CapsuleDeploymentException(
                self.identifier,  # type: ignore
                f"A capsule upgrade was triggered outside current deployment instance. Current deployment version was discarded. Current deployment version: {current_deployment_instance_version} and new version: {capsule_response.get('version', None)}",
            )

    def _monitor_worker_readiness(
        self,
        workers_sm: "CapsuleWorkersStateMachine",
    ):
        """returns True if the worker is crashlooping, False otherwise

        Polls the workers once per second for `readiness_wait_time` iterations
        and stops early as soon as a crashloop is reported.
        """
        logger = self._logger_fn or partial(print, file=sys.stderr)
        for _ in range(self._readiness_wait_time):
            time.sleep(1)
            workers_response = self.get_workers()
            workers_sm.add_status(workers_response)
            workers_sm.report_current_status(logger)
            if workers_sm.is_crashlooping:
                return True
        return False

    def _extract_logs_from_crashlooping_worker(
        self, workers_sm: "CapsuleWorkersStateMachine"
    ):
        """Return `(logs, worker_id)` for the first worker in phase "CrashLoopBackOff".

        The logs are requested with `previous=True`. Returns `(None, None)`
        when no worker in the current status is in that phase.
        """
        worker_id = next(
            (
                worker["workerId"]
                for worker in workers_sm.current_status
                if worker["phase"] == "CrashLoopBackOff"
            ),
            None,
        )
        if worker_id is None:
            return None, None
        logs = self.capsule_api.logs(self.identifier, worker_id, previous=True)
        return logs, worker_id

    def wait_for_terminal_state(
        self,
    ):
        """Poll the capsule and its workers once per second until a terminal state.

        Each iteration fetches the capsule and worker lists, verifies that the
        backend version still matches this deployment, records both statuses
        and evaluates the readiness and failure conditions. The loop ends on
        readiness, on failure (raising) or after `create_timeout` iterations.

        Returns:
            dict with `id`, `auth_type`, `public_url`, `available_replicas`,
            `name`, `deployed_version` and `deployed_at`.

        Raises:
            CapsuleDeploymentException: on a version mismatch, on a detected
                failure/crashloop, or when the capsule is not ready to serve
                traffic although `min_replicas > 0` and the readiness
                condition is not ASYNC.
        """
        logger = self._logger_fn or partial(print, file=sys.stderr)
        state_machine = CapsuleStateMachine(
            self.identifier, self.current_deployment_instance_version
        )
        # min_replicas will always be present
        min_replicas = self._app_config.get_state("replicas", {}).get("min")
        workers_state_machine = CapsuleWorkersStateMachine(
            self.identifier,
            self.current_deployment_instance_version,
            deployment_mode=self._success_terminal_state_condition,
            minimum_replicas=min_replicas,
        )
        self.status = state_machine
        for i in range(self._create_timeout):
            time.sleep(1)
            capsule_response = self.get()
            workers_response = self.get_workers()

            # We first need to check if someone has not upgraded the capsule under the hood and
            # the current deployment instance is invalid.
            self._backend_version_mismatch_check(
                capsule_response, self.current_deployment_instance_version
            )
            state_machine.add_status(capsule_response.get("status", {}))
            workers_state_machine.add_status(workers_response)
            state_machine.report_current_status(logger)

            workers_state_machine.report_current_status(logger)
            # Deployment readiness checks will determine what is the terminal state
            # of the workerstate machine. If we detect a terminal state in the workers,
            # then even if the capsule upgrade is still in progress we will end up crashing
            # the deployment.
            (
                capsule_ready,
                further_check_worker_readiness,
            ) = DEPLOYMENT_READY_CONDITIONS.check_readiness_condition(
                state_machine.current_status,
                workers_state_machine.current_version_deployment_status(),
                self._success_terminal_state_condition,
            )

            failure_condition_satisfied = (
                DEPLOYMENT_READY_CONDITIONS.check_failure_condition(
                    state_machine.current_status,
                    workers_state_machine.current_version_deployment_status(),
                )
            )
            if capsule_ready or failure_condition_satisfied:
                logger(
                    "💊 %s deployment status: %s | worker states: [success :%s | failure :%s ] "
                    % (
                        self.capsule_type.title(),
                        "in progress"
                        if state_machine.update_in_progress
                        else "completed",
                        capsule_ready,
                        failure_condition_satisfied,
                    )
                )
                _further_readiness_check_failed = False
                if further_check_worker_readiness:
                    # HACK : monitor the workers for N seconds to make sure they are healthy
                    # this is a hack. Ideally we should implement a healthcheck as a first class citizen
                    # but it will take some time to do that so in the meanwhile a timeout set on the cli
                    # side will be really helpful.
                    logger(
                        "💊 running last minute readiness check for %s..."
                        % self.identifier
                    )
                    _further_readiness_check_failed = self._monitor_worker_readiness(
                        workers_state_machine
                    )

                if CAPSULE_DEBUG:
                    logger(
                        f"[debug] 💊 {self.capsule_type} {self.identifier}: further_check_worker_readiness {_further_readiness_check_failed} | failure_condition_satisfied {failure_condition_satisfied}"
                    )

                # We should still check for failure state and crash if we detect something in the readiness check
                if failure_condition_satisfied or _further_readiness_check_failed:
                    # hit the logs endpoint for the worker and get the logs
                    # Print those logs out on the terminal
                    # raise an exception that should be caught gracefully by the cli
                    logs, worker_id = self._extract_logs_from_crashlooping_worker(
                        workers_state_machine
                    )
                    if logs is not None:
                        # todo: It would be really odd if the logs are not present and we discover something is crashlooping.
                        # Handle that condition later
                        logger(
                            *(
                                [
                                    f"💥 Worker ID ({worker_id}) is crashlooping. Please check the following logs for more information: "
                                ]
                                + ["\t" + log_line["message"] for log_line in logs]
                            )
                        )
                    raise CapsuleDeploymentException(
                        self.identifier,
                        f"Worker ID ({worker_id}) is crashlooping. Please check the logs for more information.",
                    )

                if state_machine.ready_to_serve_traffic:
                    logger(
                        "💊 %s %s is ready to serve traffic on the URL: %s"
                        % (
                            self.capsule_type,
                            self.identifier,
                            state_machine.out_of_cluster_url,
                        ),
                    )

                break

            if CAPSULE_DEBUG and self._debug_dir:
                state_machine.save_debug_info(self._debug_dir)
                workers_state_machine.save_debug_info(self._debug_dir)
                if i % 3 == 0:  # Every 3 seconds report the status
                    logger(
                        f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
                    )

        # We will only check ready_to_serve_traffic under the following conditions:
        # If the readiness condition is not Async and min_replicas in this deployment
        # instance is > 0
        _is_async_readiness = (
            self._success_terminal_state_condition == DEPLOYMENT_READY_CONDITIONS.ASYNC
        )
        if (
            min_replicas > 0
            and not _is_async_readiness
            and not self.status.ready_to_serve_traffic
        ):
            raise CapsuleDeploymentException(
                self.identifier,
                f"Capsule {self.identifier} failed to be ready to serve traffic",
            )

        if CAPSULE_DEBUG and self._debug_dir:
            state_machine.save_debug_info(self._debug_dir)
            workers_state_machine.save_debug_info(self._debug_dir)
            logger(
                f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status [on return]: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
            )

        return dict(
            id=self.identifier,
            auth_type=self.capsule_type,
            public_url=self.url,
            available_replicas=self.status.available_replicas,
            name=self.name,
            deployed_version=self.current_deployment_instance_version,
            deployed_at=datetime.now().isoformat(),
        )