ob-metaflow-extensions 1.1.130__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +34 -4
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +1 -1
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +43 -9
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +12 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +2 -16
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +81 -11
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +46 -9
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +94 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.130.dist-info/RECORD +0 -56
- {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# on 3.8+ use the stdlib TypedDict;
# in TYPE_CHECKING blocks mypy/pyright still pick it up on older Pythons
if sys.version_info >= (3, 8):
    from typing import TypedDict
else:
    if TYPE_CHECKING:
        # for the benefit of type-checkers
        from typing import TypedDict  # noqa: F401

    # runtime no-op TypedDict shim
    class _TypedDictMeta(type):
        # Accepting the ``total`` keyword keeps ``class X(TypedDict, total=False)``
        # valid at class-creation time on <3.8; the flag has no runtime effect here.
        def __new__(cls, name, bases, namespace, total=True):
            # ignore total at runtime
            return super().__new__(cls, name, bases, namespace)

    class TypedDict(dict, metaclass=_TypedDictMeta):
        # Runtime stand-in for typing.TypedDict on <3.8.
        # Subclasses behave as plain dicts; the annotations are documentation
        # only and are enforced solely by static type-checkers.
        pass
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class _dagNode:
|
|
25
|
+
def __init__(self, name: str):
|
|
26
|
+
self.name = name
|
|
27
|
+
self.incoming_nodes: List["_dagNode"] = []
|
|
28
|
+
self.outgoing_nodes: List["_dagNode"] = []
|
|
29
|
+
|
|
30
|
+
def goto(self, *nodes: "_dagNode"):
|
|
31
|
+
for node in nodes:
|
|
32
|
+
self.outgoing_nodes.append(node)
|
|
33
|
+
node.incoming_nodes.append(self)
|
|
34
|
+
return self
|
|
35
|
+
|
|
36
|
+
def arrives_from(self, *nodes: "_dagNode"):
|
|
37
|
+
for node in nodes:
|
|
38
|
+
node.outgoing_nodes.append(self)
|
|
39
|
+
self.incoming_nodes.append(node)
|
|
40
|
+
return self
|
|
41
|
+
|
|
42
|
+
def __repr__(self):
|
|
43
|
+
return self.name
|
|
44
|
+
|
|
45
|
+
def __str__(self):
|
|
46
|
+
return self.name
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class _capsuleDeployerStateMachine:
    """Static description of the capsule-deployment lifecycle as a DAG of states."""

    def __init__(self):
        # Entry / terminal states.
        start = _dagNode("start")
        fail = _dagNode("fail")
        success = _dagNode("success")
        upgrade = _dagNode("upgrade")
        first_time_create = _dagNode("first_time_create")
        end = _dagNode("end")

        # Deploy-call states.
        deploy_call = _dagNode("capsule_deploy_api_call")
        deploy_rejected = _dagNode("capsule_deploy_api_call_rejected")
        workers_pending = _dagNode("capsule_worker_pending")

        # Worker-progress states.
        one_worker_ready = _dagNode("capsule_single_worker_ready")
        all_workers_ready = _dagNode("capsule_all_workers_ready")
        worker_crashed = _dagNode("current_deployment_deployed_worker_crashed")
        pending_timeout = _dagNode(
            "current_deployment_workers_pending_beyond_timeout"
        )

        # Wire the transitions. NOTE: edge insertion order is preserved by
        # _dagNode, and get_edges/to_dot emit edges in that order.
        start.goto(first_time_create, upgrade)
        deploy_call.arrives_from(first_time_create, upgrade).goto(
            deploy_rejected, workers_pending
        )
        workers_pending.goto(
            one_worker_ready,
            all_workers_ready,
            worker_crashed,
            pending_timeout,
        )
        success.arrives_from(one_worker_ready, all_workers_ready).goto(end)
        fail.arrives_from(
            deploy_rejected, worker_crashed, pending_timeout
        ).goto(end)

        self._states = [
            start,
            fail,
            success,
            upgrade,
            first_time_create,
            end,
            one_worker_ready,
            all_workers_ready,
            worker_crashed,
            pending_timeout,
            deploy_call,
            deploy_rejected,
            workers_pending,
        ]

    def get_edges(self) -> List[Tuple["_dagNode", "_dagNode"]]:
        """
        Returns a list of (src_node, dst_node) tuples for all transitions.
        """
        return [
            (state, successor)
            for state in self._states
            for successor in state.outgoing_nodes
        ]

    def to_dot(self, graph_name="StateMachine"):
        """
        Emit a Graphviz DOT description of the state machine.
        """
        # rankdir=LR lays the graph out left-to-right.
        header = [f"digraph {graph_name} {{", " rankdir=LR;"]
        body = [f' "{src}" -> "{dst}";' for src, dst in self.get_edges()]
        return "\n".join(header + body + ["}"])

    def adjacency_list(self):
        """
        Returns a dict mapping each node to list of its outgoing nodes.
        """
        return {state: list(state.outgoing_nodes) for state in self._states}

    def __str__(self):
        # DOT is the default textual representation of the machine.
        return self.to_dot()

    def to_diagraph(self):
        # Requires the third-party `graphviz` package.
        from graphviz import Digraph  # type: ignore

        graph = Digraph(name="StateMachine", format="png")
        graph.attr(rankdir="LR")  # left-to-right layout

        # One edge per transition; use the node names as labels.
        for src, dst in self.get_edges():
            graph.edge(src.name, dst.name)

        # Render to "state_machine" (PNG) without opening a viewer.
        graph.render("state_machine", view=False)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class AccessInfo(TypedDict):
    """URLs through which a deployed capsule can be accessed."""

    # Endpoint reachable from outside the cluster (per field name; confirm with backend).
    outOfClusterURL: str
    # Endpoint reachable from within the cluster (per field name; confirm with backend).
    inClusterURL: str
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class CapsuleStatus(TypedDict):
    """Capsule-level deployment status as reported by the backend."""

    availableReplicas: int
    readyToServeTraffic: bool
    # URLs for reaching the capsule (in/out of cluster).
    accessInfo: AccessInfo
    # True while a rollout is still in progress; readiness checks in
    # DEPLOYMENT_READY_CONDITIONS wait for this to turn False.
    updateInProgress: bool
    currentlyServedVersion: str
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class WorkerStatus(TypedDict):
    """Status of a single capsule worker."""

    workerId: str
    # Lifecycle phase; values referenced in this module include
    # Pending, Running, Succeeded, Failed, Unknown, CrashLoopBackOff.
    phase: str
    # Activity metric; semantics not defined in this module -- TODO confirm.
    activity: int
    activityDataAvailable: bool
    # Deployment version this worker belongs to.
    version: str
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class WorkerInfoDict(TypedDict):
    """Raw worker statuses bucketed by phase, then grouped by worker version.

    Each value maps a version string to the workers of that version observed
    in the corresponding phase (built by ``_capsule_worker_semantic_status``).
    """

    # TODO : Check if we need to account for the `Terminating` state
    pending: Dict[str, List[WorkerStatus]]
    running: Dict[str, List[WorkerStatus]]
    crashlooping: Dict[str, List[WorkerStatus]]
    failed: Dict[str, List[WorkerStatus]]
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class CurrentWorkerInfo(TypedDict):
    """Per-phase worker counts for the version currently being deployed.

    Produced by ``_capsule_worker_semantic_status`` as the ``current_info``
    field of ``CapsuleWorkerStatusDict``.
    """

    # TODO : Check if we need to account for the `Terminating` state
    pending: int
    running: int
    crashlooping: int
    # Workers in the `Failed` phase. This key is populated by
    # `_capsule_worker_semantic_status` but was missing from the schema.
    failed: int
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class LogLine(TypedDict):
    """A single log-line payload."""

    message: str
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class DEPLOYMENT_READY_CONDITIONS:
    """
    Deployment ready conditions define what is considered a successful completion of the current deployment instance.
    This allows users or platform designers to configure the criteria for deployment readiness.

    Why do we need deployment readiness conditions?
    - Deployments might be taking place from a CI/CD-esque environment, In these setups, the downstream build triggers might be depending on a specific criteria for deployment completion. Having readiness conditions allows the CI/CD systems to get a signal of when the deployment is ready.
    - Users might be calling the deployment API under different conditions:
        - Some users might want a cluster of workers ready before serving traffic while others might want just one worker ready to start serving traffic.

    Some readiness conditions include:
    1) [at_least_one_running] At least min(min_replicas, 1) workers of the current deployment instance's version have started running.
        - Usecase: Some endpoints may be deployed ephemerally and are considered ready when at least one instance is running; additional instances are for load management.
    2) [all_running] At least min_replicas number of workers are running for the deployment to be considered ready.
        - Usecase: Operators may require that all replicas are available before traffic is routed. Needed when inference endpoints maybe under some SLA or require a larger load
    3) [fully_finished] At least min_replicas number of workers are running for the deployment and there are no pending or crashlooping workers from previous versions lying around.
        - Usecase: Ensuring endpoint is fully available and no other versions are running or endpoint has been fully scaled down.
    4) [async] The deployment will be assumed ready as soon as the server acknowledges its registered the app in the backend.
        - Usecase: Operators may only care that the URL is minted for the deployment or the operator wants the deployment to eventually scales down to 0.
    """

    # At least one worker of the current deployment's version has started running.
    ATLEAST_ONE_RUNNING = "at_least_one_running"

    # All workers of the current deployment's version (i.e. min_replicas of them)
    # have started running. Workers of other deployments may still be around.
    ALL_RUNNING = "all_running"

    # min_replicas workers are running AND no pending/crashlooping workers
    # from previous versions remain.
    FULLY_FINISHED = "fully_finished"

    # Ready as soon as the URL is minted; worker statuses are never checked.
    ASYNC = "async"

    @classmethod
    def check_failure_condition(
        cls,
        capsule_status: CapsuleStatus,
        worker_semantic_status: "CapsuleWorkerSemanticStatus",
    ) -> bool:
        """Return True when the deployment has failed (some worker is crashlooping)."""
        return worker_semantic_status["status"]["at_least_one_crashlooping"]

    @classmethod
    def check_readiness_condition(
        cls,
        capsule_status: CapsuleStatus,
        worker_semantic_status: "CapsuleWorkerSemanticStatus",
        readiness_condition: str,
    ) -> Tuple[bool, bool]:
        """Evaluate `readiness_condition` against the current deployment state.

        Parameters
        ----------
        capsule_status : CapsuleStatus
            Current capsule status, including the `updateInProgress` flag.
        worker_semantic_status : CapsuleWorkerSemanticStatus
            Semantic worker flags for the deployment's target version.
        readiness_condition : str
            One of the class constants (ATLEAST_ONE_RUNNING, ALL_RUNNING,
            FULLY_FINISHED, ASYNC).

        Returns
        -------
        Tuple[bool, bool]
            (condition_satisfied, perform_worker_readiness_checks). The second
            element is False only for ASYNC, where worker status is ignored.

        Raises
        ------
        ValueError
            If `readiness_condition` is not one of the supported constants.
        """
        worker_flags = worker_semantic_status["status"]

        if readiness_condition == cls.ATLEAST_ONE_RUNNING:
            satisfied = (
                worker_flags["at_least_one_running"]
                and not capsule_status["updateInProgress"]
            )
            return satisfied, True

        if readiness_condition == cls.ALL_RUNNING:
            satisfied = (
                worker_flags["all_running"]
                and not capsule_status["updateInProgress"]
            )
            return satisfied, True

        if readiness_condition == cls.FULLY_FINISHED:
            # `updateInProgress` is deliberately ignored here: it can flip to
            # False when users scale all replicas down to 0, so only the worker
            # semantic status (which tracks what is actually running) is trusted.
            return worker_flags["fully_finished"], True

        if readiness_condition == cls.ASYNC:
            # Satisfied immediately after the server responds with the URL;
            # skip all worker readiness checks.
            return True, False

        raise ValueError(f"Invalid readiness condition: {readiness_condition}")

    @classmethod
    def docstring(cls):
        # Exposed so callers can surface this class's documentation to users.
        return cls.__doc__

    @classmethod
    def enums(cls):
        """All supported readiness-condition values."""
        return [
            cls.ATLEAST_ONE_RUNNING,
            cls.ALL_RUNNING,
            cls.FULLY_FINISHED,
            cls.ASYNC,
        ]
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class CapsuleWorkerStatusDict(TypedDict):
    """Semantic readiness flags for the workers of one deployment version.

    Computed by ``_capsule_worker_semantic_status``.
    """

    # At least one worker of the target version is Pending.
    at_least_one_pending: bool
    # At least min(min_replicas, 1) workers of the target version are Running.
    at_least_one_running: bool
    # At least one worker of the target version is CrashLoopBackOff or Failed.
    at_least_one_crashlooping: bool
    # At least min_replicas workers of the target version are Running.
    all_running: bool
    # all_running, only the target version is running, nothing pending/crashlooping.
    fully_finished: bool
    # No workers of the target version are running, pending, or crashlooping.
    none_present: bool
    # Per-phase worker counts for the target version.
    current_info: CurrentWorkerInfo
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
class CapsuleWorkerSemanticStatus(TypedDict):
    """Semantic flags plus raw worker info for a single deployment version."""

    # The deployment version the flags were computed against.
    final_version: str
    # Semantic readiness flags for that version.
    status: CapsuleWorkerStatusDict
    # Raw workers bucketed by phase and grouped by version.
    worker_info: WorkerInfoDict
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _capsule_worker_status_diff(
|
|
343
|
+
current_status: CapsuleWorkerSemanticStatus,
|
|
344
|
+
previous_status: Union[CapsuleWorkerSemanticStatus, None],
|
|
345
|
+
) -> List[str]:
|
|
346
|
+
"""
|
|
347
|
+
The goal of this function is to return a status string that will be used to update the user the
|
|
348
|
+
change in status of the different capsules.
|
|
349
|
+
"""
|
|
350
|
+
if previous_status is None:
|
|
351
|
+
# Check if the current status has pending workers or crashlooping workers
|
|
352
|
+
curr = current_status["status"]["current_info"]
|
|
353
|
+
version = current_status["final_version"]
|
|
354
|
+
changes = []
|
|
355
|
+
|
|
356
|
+
if curr["pending"] > 0:
|
|
357
|
+
changes.append(f"⏳ {curr['pending']} worker(s) pending")
|
|
358
|
+
|
|
359
|
+
if curr["running"] > 0:
|
|
360
|
+
changes.append(f"🚀 {curr['running']} worker(s) currently running")
|
|
361
|
+
|
|
362
|
+
if curr["crashlooping"] > 0:
|
|
363
|
+
changes.append(f"💥 {curr['crashlooping']} worker(s) currently crashlooping")
|
|
364
|
+
|
|
365
|
+
return changes
|
|
366
|
+
|
|
367
|
+
curr = current_status["status"]["current_info"]
|
|
368
|
+
prev = previous_status["status"]["current_info"]
|
|
369
|
+
version = current_status["final_version"]
|
|
370
|
+
|
|
371
|
+
changes = []
|
|
372
|
+
|
|
373
|
+
# Track worker count changes for the target version
|
|
374
|
+
pending_diff = curr["pending"] - prev["pending"]
|
|
375
|
+
running_diff = curr["running"] - prev["running"]
|
|
376
|
+
crash_diff = curr["crashlooping"] - prev["crashlooping"]
|
|
377
|
+
|
|
378
|
+
# Worker count changes
|
|
379
|
+
if pending_diff > 0:
|
|
380
|
+
changes.append(
|
|
381
|
+
f"⏳ {pending_diff} new worker(s) pending. Total pending ({curr['pending']})"
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
if running_diff > 0:
|
|
385
|
+
changes.append(
|
|
386
|
+
f"🚀 {running_diff} worker(s) started running. Total running ({curr['running']})"
|
|
387
|
+
)
|
|
388
|
+
elif running_diff < 0:
|
|
389
|
+
changes.append(
|
|
390
|
+
f"🛑 {abs(running_diff)} worker(s) stopped running. Total running ({curr['running']})"
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
if crash_diff > 0:
|
|
394
|
+
changes.append(
|
|
395
|
+
f"💥 {crash_diff} worker(s) started crashlooping. Total crashlooping ({curr['crashlooping']})"
|
|
396
|
+
)
|
|
397
|
+
elif crash_diff < 0:
|
|
398
|
+
changes.append(f"🔧 {abs(crash_diff)} worker(s) recovered from crashlooping")
|
|
399
|
+
|
|
400
|
+
# Significant state transitions
|
|
401
|
+
if (
|
|
402
|
+
not previous_status["status"]["at_least_one_running"]
|
|
403
|
+
and current_status["status"]["at_least_one_running"]
|
|
404
|
+
):
|
|
405
|
+
changes.append(f"✅ First worker came online")
|
|
406
|
+
|
|
407
|
+
if (
|
|
408
|
+
not previous_status["status"]["all_running"]
|
|
409
|
+
and current_status["status"]["all_running"]
|
|
410
|
+
):
|
|
411
|
+
changes.append(f"🎉 All workers are now running")
|
|
412
|
+
|
|
413
|
+
if (
|
|
414
|
+
not previous_status["status"]["at_least_one_crashlooping"]
|
|
415
|
+
and current_status["status"]["at_least_one_crashlooping"]
|
|
416
|
+
):
|
|
417
|
+
changes.append(f"⚠️ Worker crash detected")
|
|
418
|
+
|
|
419
|
+
# Current state summary
|
|
420
|
+
|
|
421
|
+
return changes
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _capsule_worker_semantic_status(
|
|
425
|
+
workers: List[WorkerStatus], version: str, min_replicas: int
|
|
426
|
+
) -> CapsuleWorkerSemanticStatus:
|
|
427
|
+
def _filter_workers_by_phase(
|
|
428
|
+
workers: List[WorkerStatus], phase: str
|
|
429
|
+
) -> List[WorkerStatus]:
|
|
430
|
+
return [w for w in workers if w.get("phase") == phase]
|
|
431
|
+
|
|
432
|
+
def _make_version_dict(
|
|
433
|
+
_workers: List[WorkerStatus], phase: str
|
|
434
|
+
) -> Dict[str, List[WorkerStatus]]:
|
|
435
|
+
xx: Dict[str, List[WorkerStatus]] = {}
|
|
436
|
+
for w in _workers:
|
|
437
|
+
if w.get("phase") != phase:
|
|
438
|
+
continue
|
|
439
|
+
worker_version = w.get("version")
|
|
440
|
+
if worker_version is not None:
|
|
441
|
+
if worker_version not in xx:
|
|
442
|
+
xx[worker_version] = []
|
|
443
|
+
xx[worker_version].append(w)
|
|
444
|
+
return xx
|
|
445
|
+
|
|
446
|
+
# phases can be Pending, Running, Succeeded, Failed, Unknown, CrashLoopBackOff
|
|
447
|
+
pending_workers = _make_version_dict(workers, "Pending")
|
|
448
|
+
running_workers = _make_version_dict(workers, "Running")
|
|
449
|
+
crashlooping_workers = _make_version_dict(workers, "CrashLoopBackOff")
|
|
450
|
+
failed_workers = _make_version_dict(workers, "Failed")
|
|
451
|
+
|
|
452
|
+
# current_status (formulated basis):
|
|
453
|
+
# - at least one pods are pending for `_end_state_capsule_version`
|
|
454
|
+
# - at least one pod is in Running state for `_end_state_capsule_version` (maybe terminal) [Might require health-check thing here]
|
|
455
|
+
# - at least one pod is crashlooping for `_end_state_capsule_version` (maybe terminal)
|
|
456
|
+
# - all pods are running for `_end_state_capsule_version` that match the minimum number of replicas
|
|
457
|
+
# - all pods are running for `_end_state_capsule_version` that match the maximum number of replicas and no other pods of older versions are running
|
|
458
|
+
# - no pods relating to `_end_state_capsule_version` are pending/running/crashlooping
|
|
459
|
+
|
|
460
|
+
# Helper to count pods for the final version in each state
|
|
461
|
+
def count_for_version(workers_dict):
|
|
462
|
+
return len(workers_dict.get(version, []))
|
|
463
|
+
|
|
464
|
+
status_dict: CapsuleWorkerStatusDict = {
|
|
465
|
+
"at_least_one_pending": count_for_version(pending_workers) > 0,
|
|
466
|
+
# if min_replicas is 0, the at_least_one_running should be true for running worker count = 0
|
|
467
|
+
"at_least_one_running": (
|
|
468
|
+
count_for_version(running_workers) >= min(min_replicas, 1)
|
|
469
|
+
),
|
|
470
|
+
"at_least_one_crashlooping": count_for_version(crashlooping_workers) > 0
|
|
471
|
+
or count_for_version(failed_workers) > 0,
|
|
472
|
+
"none_present": (
|
|
473
|
+
count_for_version(running_workers) == 0
|
|
474
|
+
and count_for_version(pending_workers) == 0
|
|
475
|
+
and count_for_version(crashlooping_workers) == 0
|
|
476
|
+
),
|
|
477
|
+
"all_running": count_for_version(running_workers) >= min_replicas,
|
|
478
|
+
"fully_finished": (
|
|
479
|
+
count_for_version(running_workers) >= min_replicas
|
|
480
|
+
# count the workers of different versions that are runnning
|
|
481
|
+
# and ensure that only the current version's workers are running.
|
|
482
|
+
and count_for_version(running_workers)
|
|
483
|
+
== len(_filter_workers_by_phase(workers, "Running"))
|
|
484
|
+
and len(_filter_workers_by_phase(workers, "Pending")) == 0
|
|
485
|
+
and len(_filter_workers_by_phase(workers, "CrashLoopBackOff")) == 0
|
|
486
|
+
),
|
|
487
|
+
"current_info": {
|
|
488
|
+
"pending": count_for_version(pending_workers),
|
|
489
|
+
"running": count_for_version(running_workers),
|
|
490
|
+
"crashlooping": count_for_version(crashlooping_workers),
|
|
491
|
+
"failed": count_for_version(failed_workers),
|
|
492
|
+
},
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
worker_info: WorkerInfoDict = {
|
|
496
|
+
"pending": pending_workers,
|
|
497
|
+
"running": running_workers,
|
|
498
|
+
"crashlooping": crashlooping_workers,
|
|
499
|
+
"failed": failed_workers,
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
return {
|
|
503
|
+
"final_version": version,
|
|
504
|
+
"status": status_dict,
|
|
505
|
+
"worker_info": worker_info,
|
|
506
|
+
}
|
|
File without changes
|