ob-metaflow-extensions 1.1.130__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic; see the registry's advisory page for more details.

Files changed (105)
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +34 -4
  3. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  6. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  35. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  36. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  37. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  38. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  39. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  40. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +1 -1
  41. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  42. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  43. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  44. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  45. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  46. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +43 -9
  47. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +12 -0
  48. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  49. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  50. metaflow_extensions/outerbounds/plugins/nim/card.py +2 -16
  51. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  52. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  53. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  54. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  55. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
  56. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
  57. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  58. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  59. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  60. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  61. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  62. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  63. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  64. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  65. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  66. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  67. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  68. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  69. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  70. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  71. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  72. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  73. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  74. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  75. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  76. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  77. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  78. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  79. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
  80. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +81 -11
  81. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
  82. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
  83. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
  84. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
  85. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
  86. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  87. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  88. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  89. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  90. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  91. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  92. metaflow_extensions/outerbounds/remote_config.py +46 -9
  93. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +94 -2
  94. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  95. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  96. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  97. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  98. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  99. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  100. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  101. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  102. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  103. ob_metaflow_extensions-1.1.130.dist-info/RECORD +0 -56
  104. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  105. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,506 @@
1
+ import sys
2
+ from typing import TYPE_CHECKING, Dict, List, Tuple, Union
3
+
4
+
5
+ # on 3.8+ use the stdlib TypedDict;
6
+ # in TYPE_CHECKING blocks mypy/pyright still pick it up on older Pythons
7
+ if sys.version_info >= (3, 8):
8
+ from typing import TypedDict
9
+ else:
10
+ if TYPE_CHECKING:
11
+ # for the benefit of type-checkers
12
+ from typing import TypedDict # noqa: F401
13
+ # runtime no-op TypedDict shim
14
+ class _TypedDictMeta(type):
15
+ def __new__(cls, name, bases, namespace, total=True):
16
+ # ignore total at runtime
17
+ return super().__new__(cls, name, bases, namespace)
18
+
19
+ class TypedDict(dict, metaclass=_TypedDictMeta):
20
+ # Runtime stand-in for typing.TypedDict on <3.8.
21
+ pass
22
+
23
+
24
+ class _dagNode:
25
+ def __init__(self, name: str):
26
+ self.name = name
27
+ self.incoming_nodes: List["_dagNode"] = []
28
+ self.outgoing_nodes: List["_dagNode"] = []
29
+
30
+ def goto(self, *nodes: "_dagNode"):
31
+ for node in nodes:
32
+ self.outgoing_nodes.append(node)
33
+ node.incoming_nodes.append(self)
34
+ return self
35
+
36
+ def arrives_from(self, *nodes: "_dagNode"):
37
+ for node in nodes:
38
+ node.outgoing_nodes.append(self)
39
+ self.incoming_nodes.append(node)
40
+ return self
41
+
42
+ def __repr__(self):
43
+ return self.name
44
+
45
+ def __str__(self):
46
+ return self.name
47
+
48
+
49
+ class _capsuleDeployerStateMachine:
50
+ def __init__(self):
51
+ # -- (your existing setup) --
52
+ start_state = _dagNode("start")
53
+ fail_state = _dagNode("fail")
54
+ success_state = _dagNode("success")
55
+ upgrade_state = _dagNode("upgrade")
56
+ first_time_create_state = _dagNode("first_time_create")
57
+ end_state = _dagNode("end")
58
+
59
+ capsule_deploy_api_call = _dagNode("capsule_deploy_api_call")
60
+ capsule_deploy_api_call_rejected = _dagNode("capsule_deploy_api_call_rejected")
61
+ capsule_worker_pending = _dagNode("capsule_worker_pending")
62
+
63
+ capsule_single_worker_ready = _dagNode("capsule_single_worker_ready")
64
+ capsule_multiple_workers_ready = _dagNode("capsule_all_workers_ready")
65
+ current_deployment_deployed_worker_crashed = _dagNode(
66
+ "current_deployment_deployed_worker_crashed"
67
+ )
68
+ current_deployment_workers_pending_beyond_timeout = _dagNode(
69
+ "current_deployment_workers_pending_beyond_timeout"
70
+ )
71
+
72
+ start_state.goto(first_time_create_state, upgrade_state)
73
+
74
+ capsule_deploy_api_call.arrives_from(
75
+ first_time_create_state, upgrade_state
76
+ ).goto(capsule_deploy_api_call_rejected, capsule_worker_pending)
77
+
78
+ capsule_worker_pending.goto(
79
+ capsule_single_worker_ready,
80
+ capsule_multiple_workers_ready,
81
+ current_deployment_deployed_worker_crashed,
82
+ current_deployment_workers_pending_beyond_timeout,
83
+ )
84
+ success_state.arrives_from(
85
+ capsule_single_worker_ready, capsule_multiple_workers_ready
86
+ ).goto(end_state)
87
+ fail_state.arrives_from(
88
+ capsule_deploy_api_call_rejected,
89
+ current_deployment_deployed_worker_crashed,
90
+ current_deployment_workers_pending_beyond_timeout,
91
+ ).goto(end_state)
92
+
93
+ self._states = [
94
+ start_state,
95
+ fail_state,
96
+ success_state,
97
+ upgrade_state,
98
+ first_time_create_state,
99
+ end_state,
100
+ capsule_single_worker_ready,
101
+ capsule_multiple_workers_ready,
102
+ current_deployment_deployed_worker_crashed,
103
+ current_deployment_workers_pending_beyond_timeout,
104
+ capsule_deploy_api_call,
105
+ capsule_deploy_api_call_rejected,
106
+ capsule_worker_pending,
107
+ ]
108
+
109
+ def get_edges(self) -> List[Tuple["_dagNode", "_dagNode"]]:
110
+ """
111
+ Returns a list of (src_node, dst_node) tuples for all transitions.
112
+ """
113
+ edges = []
114
+ for node in self._states:
115
+ for out in node.outgoing_nodes:
116
+ edges.append((node, out))
117
+ return edges
118
+
119
+ def to_dot(self, graph_name="StateMachine"):
120
+ """
121
+ Emit a Graphviz DOT description of the state machine.
122
+ """
123
+ lines = [f"digraph {graph_name} {{"]
124
+ # optional: rankdir=LR for left-to-right layout
125
+ lines.append(" rankdir=LR;")
126
+ for src, dst in self.get_edges():
127
+ lines.append(f' "{src}" -> "{dst}";')
128
+ lines.append("}")
129
+ return "\n".join(lines)
130
+
131
+ def adjacency_list(self):
132
+ """
133
+ Returns a dict mapping each node to list of its outgoing nodes.
134
+ """
135
+ return {node: list(node.outgoing_nodes) for node in self._states}
136
+
137
+ def __str__(self):
138
+ # Default to DOT format; you could swap this out for something else
139
+ return self.to_dot()
140
+
141
+ def to_diagraph(self):
142
+ from graphviz import Digraph # type: ignore
143
+
144
+ # Create a new Digraph
145
+ dot = Digraph(name="StateMachine", format="png")
146
+ dot.attr(rankdir="LR") # left-to-right layout
147
+
148
+ # Add one edge per transition in your SM
149
+ for src, dst in self.get_edges():
150
+ # src and dst are _dagNode instances; use their .name (or str(src))
151
+ dot.edge(src.name, dst.name)
152
+
153
+ # Render to file (e.g. "state_machine.png") and optionally view it:
154
+ dot.render("state_machine", view=False)
155
+
156
+
157
+ class AccessInfo(TypedDict):
158
+ outOfClusterURL: str
159
+ inClusterURL: str
160
+
161
+
162
+ class CapsuleStatus(TypedDict):
163
+ availableReplicas: int
164
+ readyToServeTraffic: bool
165
+ accessInfo: AccessInfo
166
+ updateInProgress: bool
167
+ currentlyServedVersion: str
168
+
169
+
170
+ class WorkerStatus(TypedDict):
171
+ workerId: str
172
+ phase: str
173
+ activity: int
174
+ activityDataAvailable: bool
175
+ version: str
176
+
177
+
178
+ class WorkerInfoDict(TypedDict):
179
+ # TODO : Check if we need to account for the `Terminating` state
180
+ pending: Dict[str, List[WorkerStatus]]
181
+ running: Dict[str, List[WorkerStatus]]
182
+ crashlooping: Dict[str, List[WorkerStatus]]
183
+ failed: Dict[str, List[WorkerStatus]]
184
+
185
+
186
+ class CurrentWorkerInfo(TypedDict):
187
+ # TODO : Check if we need to account for the `Terminating` state
188
+ pending: int
189
+ running: int
190
+ crashlooping: int
191
+
192
+
193
+ class LogLine(TypedDict):
194
+ message: str
195
+
196
+
197
+ class DEPLOYMENT_READY_CONDITIONS:
198
+ """
199
+ Deployment ready conditions define what is considered a successful completion of the current deployment instance.
200
+ This allows users or platform designers to configure the criteria for deployment readiness.
201
+
202
+ Why do we need deployment readiness conditions?
203
+ - Deployments might be taking place from a CI/CD-esque environment, In these setups, the downstream build triggers might be depending on a specific criteria for deployment completion. Having readiness conditions allows the CI/CD systems to get a signal of when the deployment is ready.
204
+ - Users might be calling the deployment API under different conditions:
205
+ - Some users might want a cluster of workers ready before serving traffic while others might want just one worker ready to start serving traffic.
206
+
207
+ Some readiness conditions include:
208
+ 1) [at_least_one_running] At least min(min_replicas, 1) workers of the current deployment instance's version have started running.
209
+ - Usecase: Some endpoints may be deployed ephemerally and are considered ready when at least one instance is running; additional instances are for load management.
210
+ 2) [all_running] At least min_replicas number of workers are running for the deployment to be considered ready.
211
+ - Usecase: Operators may require that all replicas are available before traffic is routed. Needed when inference endpoints maybe under some SLA or require a larger load
212
+ 3) [fully_finished] At least min_replicas number of workers are running for the deployment and there are no pending or crashlooping workers from previous versions lying around.
213
+ - Usecase: Ensuring endpoint is fully available and no other versions are running or endpoint has been fully scaled down.
214
+ 4) [async] The deployment will be assumed ready as soon as the server acknowledges its registered the app in the backend.
215
+ - Usecase: Operators may only care that the URL is minted for the deployment or the operator wants the deployment to eventually scales down to 0.
216
+ """
217
+
218
+ # `ATLEAST_ONE_RUNNING` implies that at least one worker of the current deployment instance's version has started running.
219
+ ATLEAST_ONE_RUNNING = "at_least_one_running"
220
+
221
+ # `ALL_RUNNING` implies that all workers of the current deployment instance's version have started running (i.e. all workers aligning to the minimum number of replicas).
222
+ # It doesn't imply that all the workers relating to other deployments have been torn down.
223
+ ALL_RUNNING = "all_running"
224
+
225
+ # `FULLY_FINISHED` implies at least min_replicas number of workers are running for the deployment and there are no pending or crashlooping workers from previous versions lying around.
226
+ FULLY_FINISHED = "fully_finished"
227
+
228
+ # `ASYNC` implies that the deployment will be assumed ready after the URL is minted and the worker statuses are not checked.
229
+ ASYNC = "async"
230
+
231
+ @classmethod
232
+ def check_failure_condition(
233
+ cls,
234
+ capsule_status: CapsuleStatus,
235
+ worker_semantic_status: "CapsuleWorkerSemanticStatus",
236
+ ) -> bool:
237
+ """
238
+ Check if the deployment has failed based on the current capsule and worker status.
239
+ """
240
+ return worker_semantic_status["status"]["at_least_one_crashlooping"]
241
+
242
+ @classmethod
243
+ def check_readiness_condition(
244
+ cls,
245
+ capsule_status: CapsuleStatus,
246
+ worker_semantic_status: "CapsuleWorkerSemanticStatus",
247
+ readiness_condition: str,
248
+ ) -> Tuple[bool, bool]:
249
+ """
250
+ Check if the deployment readiness condition is satisfied based on current capsule and worker status.
251
+
252
+ This method evaluates whether a deployment has reached its desired ready state according to
253
+ the specified readiness condition. Different conditions have different criteria for what
254
+ constitutes a "ready" deployment.
255
+
256
+ Parameters
257
+ ----------
258
+ capsule_status : CapsuleStatus
259
+ The current status of the capsule deployment, including update progress information.
260
+ worker_semantic_status : CapsuleWorkerSemanticStatus
261
+ Semantic status information about the workers, including counts and states.
262
+ readiness_condition : str
263
+ The readiness condition to evaluate. Must be one of the class constants:
264
+ - ATLEAST_ONE_RUNNING: At least one worker is running and update is not in progress
265
+ - ALL_RUNNING: All required workers are running and update is not in progress
266
+ - FULLY_FINISHED: All workers running with no pending/crashlooping workers and update is not in progress
267
+ - ASYNC: Deployment is ready as soon as the backend responds with a 200 on create and provides a API URL.
268
+
269
+ Returns
270
+ -------
271
+ Tuple[bool, bool]
272
+ A tuple containing:
273
+ - First element: Boolean indicating if the readiness condition is satisfied
274
+ - Second element: Boolean indicating if additional worker readiness checks
275
+ should be performed (False for ASYNC mode, True for all others)
276
+
277
+ Raises
278
+ ------
279
+ ValueError
280
+ If an invalid readiness condition is provided.
281
+ """
282
+ _worker_readiness_check = True
283
+ _readiness_condition_satisfied = False
284
+ if readiness_condition == cls.ATLEAST_ONE_RUNNING:
285
+ _readiness_condition_satisfied = (
286
+ worker_semantic_status["status"]["at_least_one_running"]
287
+ and not capsule_status["updateInProgress"]
288
+ )
289
+ elif readiness_condition == cls.ALL_RUNNING:
290
+ _readiness_condition_satisfied = (
291
+ worker_semantic_status["status"]["all_running"]
292
+ and not capsule_status["updateInProgress"]
293
+ )
294
+ elif readiness_condition == cls.FULLY_FINISHED:
295
+ # We dont wait for updateInProgress in this condition since
296
+ # UpdateInProgress can switch to false when users scale all replicas down to 0.
297
+ # So for this condition to satisfy we will only rely on the worker semantic status.
298
+ # ie. the thing actually tracking what is running and what is not.
299
+ _readiness_condition_satisfied = worker_semantic_status["status"][
300
+ "fully_finished"
301
+ ]
302
+ elif readiness_condition == cls.ASYNC:
303
+ # The async readiness condition is satisfied immediately after the server responds
304
+ # with the URL.
305
+ _readiness_condition_satisfied = True
306
+ _worker_readiness_check = False
307
+ else:
308
+ raise ValueError(f"Invalid readiness condition: {readiness_condition}")
309
+
310
+ return _readiness_condition_satisfied, _worker_readiness_check
311
+
312
+ @classmethod
313
+ def docstring(cls):
314
+ return cls.__doc__
315
+
316
+ @classmethod
317
+ def enums(cls):
318
+ return [
319
+ cls.ATLEAST_ONE_RUNNING,
320
+ cls.ALL_RUNNING,
321
+ cls.FULLY_FINISHED,
322
+ cls.ASYNC,
323
+ ]
324
+
325
+
326
+ class CapsuleWorkerStatusDict(TypedDict):
327
+ at_least_one_pending: bool
328
+ at_least_one_running: bool
329
+ at_least_one_crashlooping: bool
330
+ all_running: bool
331
+ fully_finished: bool
332
+ none_present: bool
333
+ current_info: CurrentWorkerInfo
334
+
335
+
336
+ class CapsuleWorkerSemanticStatus(TypedDict):
337
+ final_version: str
338
+ status: CapsuleWorkerStatusDict
339
+ worker_info: WorkerInfoDict
340
+
341
+
342
+ def _capsule_worker_status_diff(
343
+ current_status: CapsuleWorkerSemanticStatus,
344
+ previous_status: Union[CapsuleWorkerSemanticStatus, None],
345
+ ) -> List[str]:
346
+ """
347
+ The goal of this function is to return a status string that will be used to update the user the
348
+ change in status of the different capsules.
349
+ """
350
+ if previous_status is None:
351
+ # Check if the current status has pending workers or crashlooping workers
352
+ curr = current_status["status"]["current_info"]
353
+ version = current_status["final_version"]
354
+ changes = []
355
+
356
+ if curr["pending"] > 0:
357
+ changes.append(f"⏳ {curr['pending']} worker(s) pending")
358
+
359
+ if curr["running"] > 0:
360
+ changes.append(f"🚀 {curr['running']} worker(s) currently running")
361
+
362
+ if curr["crashlooping"] > 0:
363
+ changes.append(f"💥 {curr['crashlooping']} worker(s) currently crashlooping")
364
+
365
+ return changes
366
+
367
+ curr = current_status["status"]["current_info"]
368
+ prev = previous_status["status"]["current_info"]
369
+ version = current_status["final_version"]
370
+
371
+ changes = []
372
+
373
+ # Track worker count changes for the target version
374
+ pending_diff = curr["pending"] - prev["pending"]
375
+ running_diff = curr["running"] - prev["running"]
376
+ crash_diff = curr["crashlooping"] - prev["crashlooping"]
377
+
378
+ # Worker count changes
379
+ if pending_diff > 0:
380
+ changes.append(
381
+ f"⏳ {pending_diff} new worker(s) pending. Total pending ({curr['pending']})"
382
+ )
383
+
384
+ if running_diff > 0:
385
+ changes.append(
386
+ f"🚀 {running_diff} worker(s) started running. Total running ({curr['running']})"
387
+ )
388
+ elif running_diff < 0:
389
+ changes.append(
390
+ f"🛑 {abs(running_diff)} worker(s) stopped running. Total running ({curr['running']})"
391
+ )
392
+
393
+ if crash_diff > 0:
394
+ changes.append(
395
+ f"💥 {crash_diff} worker(s) started crashlooping. Total crashlooping ({curr['crashlooping']})"
396
+ )
397
+ elif crash_diff < 0:
398
+ changes.append(f"🔧 {abs(crash_diff)} worker(s) recovered from crashlooping")
399
+
400
+ # Significant state transitions
401
+ if (
402
+ not previous_status["status"]["at_least_one_running"]
403
+ and current_status["status"]["at_least_one_running"]
404
+ ):
405
+ changes.append(f"✅ First worker came online")
406
+
407
+ if (
408
+ not previous_status["status"]["all_running"]
409
+ and current_status["status"]["all_running"]
410
+ ):
411
+ changes.append(f"🎉 All workers are now running")
412
+
413
+ if (
414
+ not previous_status["status"]["at_least_one_crashlooping"]
415
+ and current_status["status"]["at_least_one_crashlooping"]
416
+ ):
417
+ changes.append(f"⚠️ Worker crash detected")
418
+
419
+ # Current state summary
420
+
421
+ return changes
422
+
423
+
424
+ def _capsule_worker_semantic_status(
425
+ workers: List[WorkerStatus], version: str, min_replicas: int
426
+ ) -> CapsuleWorkerSemanticStatus:
427
+ def _filter_workers_by_phase(
428
+ workers: List[WorkerStatus], phase: str
429
+ ) -> List[WorkerStatus]:
430
+ return [w for w in workers if w.get("phase") == phase]
431
+
432
+ def _make_version_dict(
433
+ _workers: List[WorkerStatus], phase: str
434
+ ) -> Dict[str, List[WorkerStatus]]:
435
+ xx: Dict[str, List[WorkerStatus]] = {}
436
+ for w in _workers:
437
+ if w.get("phase") != phase:
438
+ continue
439
+ worker_version = w.get("version")
440
+ if worker_version is not None:
441
+ if worker_version not in xx:
442
+ xx[worker_version] = []
443
+ xx[worker_version].append(w)
444
+ return xx
445
+
446
+ # phases can be Pending, Running, Succeeded, Failed, Unknown, CrashLoopBackOff
447
+ pending_workers = _make_version_dict(workers, "Pending")
448
+ running_workers = _make_version_dict(workers, "Running")
449
+ crashlooping_workers = _make_version_dict(workers, "CrashLoopBackOff")
450
+ failed_workers = _make_version_dict(workers, "Failed")
451
+
452
+ # current_status (formulated basis):
453
+ # - at least one pods are pending for `_end_state_capsule_version`
454
+ # - at least one pod is in Running state for `_end_state_capsule_version` (maybe terminal) [Might require health-check thing here]
455
+ # - at least one pod is crashlooping for `_end_state_capsule_version` (maybe terminal)
456
+ # - all pods are running for `_end_state_capsule_version` that match the minimum number of replicas
457
+ # - all pods are running for `_end_state_capsule_version` that match the maximum number of replicas and no other pods of older versions are running
458
+ # - no pods relating to `_end_state_capsule_version` are pending/running/crashlooping
459
+
460
+ # Helper to count pods for the final version in each state
461
+ def count_for_version(workers_dict):
462
+ return len(workers_dict.get(version, []))
463
+
464
+ status_dict: CapsuleWorkerStatusDict = {
465
+ "at_least_one_pending": count_for_version(pending_workers) > 0,
466
+ # if min_replicas is 0, the at_least_one_running should be true for running worker count = 0
467
+ "at_least_one_running": (
468
+ count_for_version(running_workers) >= min(min_replicas, 1)
469
+ ),
470
+ "at_least_one_crashlooping": count_for_version(crashlooping_workers) > 0
471
+ or count_for_version(failed_workers) > 0,
472
+ "none_present": (
473
+ count_for_version(running_workers) == 0
474
+ and count_for_version(pending_workers) == 0
475
+ and count_for_version(crashlooping_workers) == 0
476
+ ),
477
+ "all_running": count_for_version(running_workers) >= min_replicas,
478
+ "fully_finished": (
479
+ count_for_version(running_workers) >= min_replicas
480
+ # count the workers of different versions that are runnning
481
+ # and ensure that only the current version's workers are running.
482
+ and count_for_version(running_workers)
483
+ == len(_filter_workers_by_phase(workers, "Running"))
484
+ and len(_filter_workers_by_phase(workers, "Pending")) == 0
485
+ and len(_filter_workers_by_phase(workers, "CrashLoopBackOff")) == 0
486
+ ),
487
+ "current_info": {
488
+ "pending": count_for_version(pending_workers),
489
+ "running": count_for_version(running_workers),
490
+ "crashlooping": count_for_version(crashlooping_workers),
491
+ "failed": count_for_version(failed_workers),
492
+ },
493
+ }
494
+
495
+ worker_info: WorkerInfoDict = {
496
+ "pending": pending_workers,
497
+ "running": running_workers,
498
+ "crashlooping": crashlooping_workers,
499
+ "failed": failed_workers,
500
+ }
501
+
502
+ return {
503
+ "final_version": version,
504
+ "status": status_dict,
505
+ "worker_info": worker_info,
506
+ }
@@ -0,0 +1,4 @@
1
+ __author__ = "Manraj Singh"
2
+ __email__ = "manrajsinghgrover@gmail.com"
3
+
4
+ from .spinners import Spinners