outerbounds 0.3.183rc1__py3-none-any.whl → 0.3.185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,472 +0,0 @@
1
- from typing import List, Tuple, Dict, Union
2
-
3
-
4
- class _dagNode:
5
- def __init__(self, name: str):
6
- self.name = name
7
- self.incoming_nodes: List["_dagNode"] = []
8
- self.outgoing_nodes: List["_dagNode"] = []
9
-
10
- def goto(self, *nodes: "_dagNode"):
11
- for node in nodes:
12
- self.outgoing_nodes.append(node)
13
- node.incoming_nodes.append(self)
14
- return self
15
-
16
- def arrives_from(self, *nodes: "_dagNode"):
17
- for node in nodes:
18
- node.outgoing_nodes.append(self)
19
- self.incoming_nodes.append(node)
20
- return self
21
-
22
- def __repr__(self):
23
- return self.name
24
-
25
- def __str__(self):
26
- return self.name
27
-
28
-
29
class _capsuleDeployerStateMachine:
    """Static DAG describing the lifecycle of a capsule deployment.

    The graph is built once in `__init__`; the remaining methods only read it
    (edge listing, DOT export, adjacency view, optional graphviz rendering).
    """

    def __init__(self):
        # Create every state node up front, keyed by its display name.
        # NOTE(review): the node historically named "capsule_all_workers_ready"
        # represents the "multiple workers ready" state — name kept as-is.
        state_names = (
            "start",
            "fail",
            "success",
            "upgrade",
            "first_time_create",
            "end",
            "capsule_deploy_api_call",
            "capsule_deploy_api_call_rejected",
            "capsule_worker_pending",
            "capsule_single_worker_ready",
            "capsule_all_workers_ready",
            "current_deployment_deployed_worker_crashed",
            "current_deployment_workers_pending_beyond_timeout",
        )
        node = {name: _dagNode(name) for name in state_names}

        # Wire the transitions; insertion order mirrors the deployment lifecycle.
        node["start"].goto(node["first_time_create"], node["upgrade"])

        node["capsule_deploy_api_call"].arrives_from(
            node["first_time_create"], node["upgrade"]
        ).goto(
            node["capsule_deploy_api_call_rejected"], node["capsule_worker_pending"]
        )

        node["capsule_worker_pending"].goto(
            node["capsule_single_worker_ready"],
            node["capsule_all_workers_ready"],
            node["current_deployment_deployed_worker_crashed"],
            node["current_deployment_workers_pending_beyond_timeout"],
        )
        node["success"].arrives_from(
            node["capsule_single_worker_ready"], node["capsule_all_workers_ready"]
        ).goto(node["end"])
        node["fail"].arrives_from(
            node["capsule_deploy_api_call_rejected"],
            node["current_deployment_deployed_worker_crashed"],
            node["current_deployment_workers_pending_beyond_timeout"],
        ).goto(node["end"])

        # Canonical iteration order relied upon by get_edges()/to_dot().
        self._states = [
            node["start"],
            node["fail"],
            node["success"],
            node["upgrade"],
            node["first_time_create"],
            node["end"],
            node["capsule_single_worker_ready"],
            node["capsule_all_workers_ready"],
            node["current_deployment_deployed_worker_crashed"],
            node["current_deployment_workers_pending_beyond_timeout"],
            node["capsule_deploy_api_call"],
            node["capsule_deploy_api_call_rejected"],
            node["capsule_worker_pending"],
        ]

    def get_edges(self) -> List[Tuple["_dagNode", "_dagNode"]]:
        """Return every (source, destination) transition as a list of tuples."""
        return [
            (state, successor)
            for state in self._states
            for successor in state.outgoing_nodes
        ]

    def to_dot(self, graph_name="StateMachine"):
        """Emit a Graphviz DOT description of the state machine."""
        # rankdir=LR lays the graph out left-to-right.
        header = [f"digraph {graph_name} {{", " rankdir=LR;"]
        body = [f' "{src}" -> "{dst}";' for src, dst in self.get_edges()]
        return "\n".join(header + body + ["}"])

    def adjacency_list(self):
        """Return a dict mapping each node to a copy of its outgoing-node list."""
        adjacency = {}
        for state in self._states:
            adjacency[state] = list(state.outgoing_nodes)
        return adjacency

    def __str__(self):
        # DOT is the default textual representation.
        return self.to_dot()

    def to_diagraph(self):
        """Render the graph to "state_machine.png" via the optional graphviz package."""
        from graphviz import Digraph  # type: ignore

        graph = Digraph(name="StateMachine", format="png")
        graph.attr(rankdir="LR")  # left-to-right layout

        # One edge per transition; graphviz wants the node names, not the objects.
        for src, dst in self.get_edges():
            graph.edge(src.name, dst.name)

        # Writes "state_machine.png" next to the working directory; no viewer popup.
        graph.render("state_machine", view=False)
135
-
136
-
137
- from typing import TypedDict
138
-
139
-
140
class AccessInfo(TypedDict):
    """Access URLs reported in a capsule's status (see CapsuleStatus.accessInfo)."""

    # URL for reaching the capsule from outside the cluster.
    outOfClusterURL: str
    # URL for reaching the capsule from within the cluster.
    inClusterURL: str
143
-
144
-
145
class CapsuleStatus(TypedDict):
    """Capsule status payload consumed by the deployment readiness/failure checks."""

    availableReplicas: int
    readyToServeTraffic: bool
    accessInfo: AccessInfo
    # True while a deployment/upgrade is still rolling out; readiness conditions
    # require this to be False (except ASYNC mode).
    updateInProgress: bool
    # Version the backend currently serves; compared against the target version
    # by the ASYNC readiness condition.
    currentlyServedVersion: str
151
-
152
-
153
class WorkerStatus(TypedDict):
    """Status of a single capsule worker as reported by the platform."""

    workerId: str
    # Lifecycle phase; _capsule_worker_semantic_status buckets on
    # "Pending" / "Running" / "CrashLoopBackOff".
    phase: str
    activity: int
    activityDataAvailable: bool
    # Deployment version this worker belongs to.
    version: str
159
-
160
-
161
- from typing import Dict, List, TypedDict
162
-
163
-
164
class WorkerInfoDict(TypedDict):
    """Workers bucketed by phase, then keyed by deployment version."""

    # TODO : Check if we need to account for the `Terminating` state
    pending: Dict[str, List[WorkerStatus]]
    running: Dict[str, List[WorkerStatus]]
    crashlooping: Dict[str, List[WorkerStatus]]
169
-
170
-
171
class CurrentWorkerInfo(TypedDict):
    """Per-phase worker counts for the deployment's target version only."""

    # TODO : Check if we need to account for the `Terminating` state
    pending: int
    running: int
    crashlooping: int
176
-
177
-
178
class DEPLOYMENT_READY_CONDITIONS:
    """
    Deployment ready conditions define what is considered a successful completion of the current deployment instance.
    This allows users or platform designers to configure the criteria for deployment readiness.

    Why do we need deployment readiness conditions?
    - Deployments might be taking place from a CI/CD esq environment, In these setups, the downstream build triggers might be depending on a specific criteria for deployment completion. Having readiness conditions allows the CI/CD systems to get a signal of when the deployment is ready.
    - Users might be calling the deployment API under different conditions:
        - Some users might want a cluster of workers ready before serving traffic while others might want just one worker ready to start serving traffic.

    Some readiness conditions include:
    1) [at_least_one_running] Atleast min(min_replicas, 1) workers of the current deployment instance's version have started running.
        - Usecase: Some endpoints may be deployed ephemerally and are considered ready when at least one instance is running; additional instances are for load management.
    2) [all_running] Atleast min_replicas number of workers are running for the deployment to be considered ready.
        - Usecase: Operators may require that all replicas are available before traffic is routed. Needed when inference endpoints maybe under some SLA or require a larger load
    3) [fully_finished] Atleast min_replicas number of workers are running for the deployment and there are no pending or crashlooping workers from previous versions lying around.
        - Usecase: Ensuring endpoint is fully available and no other versions are running.
    4) [async] The deployment will be assumed ready when the `upgradeInProgress` flag changes from `True` to `False`.
        - Usecase: Operators may only care that the URL is minted for the deployment and the deployment might also be setup with 0 min workers.
    """

    # At least one worker of the target version is running
    # (zero suffices when min_replicas == 0 — see _capsule_worker_semantic_status).
    ATLEAST_ONE_RUNNING = "at_least_one_running"

    # At least min_replicas workers of the target version are running;
    # workers of older versions may still be present.
    ALL_RUNNING = "all_running"

    # min_replicas workers of the target version are running AND no pending or
    # crashlooping workers of any version remain.
    FULLY_FINISHED = "fully_finished"

    # Ready as soon as the backend reports it is serving the target version;
    # worker statuses are not inspected at all.
    ASYNC = "async"

    @classmethod
    def check_failure_condition(
        cls,
        capsule_status: CapsuleStatus,
        worker_semantic_status: "CapsuleWorkerSemanticStatus",
    ) -> bool:
        """Return True when the deployment should be considered failed.

        A deployment is failed as soon as any worker of the target version is
        crashlooping; `capsule_status` is accepted for interface symmetry with
        `check_readiness_condition` but is not consulted here.
        """
        return worker_semantic_status["status"]["at_least_one_crashlooping"]

    @classmethod
    def check_readiness_condition(
        cls,
        capsule_status: CapsuleStatus,
        worker_semantic_status: "CapsuleWorkerSemanticStatus",
        readiness_condition: str,
    ) -> Tuple[bool, bool]:
        """
        Evaluate whether `readiness_condition` is currently satisfied.

        Parameters
        ----------
        capsule_status : CapsuleStatus
            Current capsule status; provides `updateInProgress` and
            `currentlyServedVersion`.
        worker_semantic_status : CapsuleWorkerSemanticStatus
            Semantic worker summary for the target version.
        readiness_condition : str
            One of ATLEAST_ONE_RUNNING, ALL_RUNNING, FULLY_FINISHED or ASYNC.

        Returns
        -------
        Tuple[bool, bool]
            (condition_satisfied, perform_worker_readiness_checks). The second
            element is False only in ASYNC mode, where worker-level checks are
            skipped entirely.

        Raises
        ------
        ValueError
            If `readiness_condition` is not one of the supported values.
        """
        if readiness_condition == cls.ASYNC:
            # Ready once the backend registers the target version as the one
            # currently being served; no worker checks in this mode.
            satisfied = (
                capsule_status["currentlyServedVersion"]
                == worker_semantic_status["final_version"]
            )
            return satisfied, False

        # Each non-async condition maps onto one boolean flag of the semantic
        # worker status; every one of them also requires the rollout to be done.
        flag_for_condition = {
            cls.ATLEAST_ONE_RUNNING: "at_least_one_running",
            cls.ALL_RUNNING: "all_running",
            cls.FULLY_FINISHED: "fully_finished",
        }
        if readiness_condition not in flag_for_condition:
            raise ValueError(f"Invalid readiness condition: {readiness_condition}")

        satisfied = (
            worker_semantic_status["status"][flag_for_condition[readiness_condition]]
            and not capsule_status["updateInProgress"]
        )
        return satisfied, True

    @classmethod
    def docstring(cls):
        # The class docstring doubles as user-facing help text.
        return cls.__doc__

    @classmethod
    def enums(cls):
        # Every supported readiness-condition value, in declaration order.
        return [
            cls.ATLEAST_ONE_RUNNING,
            cls.ALL_RUNNING,
            cls.FULLY_FINISHED,
            cls.ASYNC,
        ]
304
-
305
-
306
class CapsuleWorkerStatusDict(TypedDict):
    """Boolean summary of worker state relative to the deployment's target version."""

    at_least_one_pending: bool
    # True when running workers of the target version >= min(min_replicas, 1),
    # i.e. trivially true when min_replicas == 0.
    at_least_one_running: bool
    at_least_one_crashlooping: bool
    # Running workers of the target version >= min_replicas.
    all_running: bool
    # all_running AND no pending/crashlooping workers of *any* version remain.
    fully_finished: bool
    # No workers of the target version exist in any tracked phase.
    none_present: bool
    current_info: CurrentWorkerInfo
314
-
315
-
316
class CapsuleWorkerSemanticStatus(TypedDict):
    """Semantic view of a capsule's workers relative to a target deployment version."""

    # The version the deployment is converging towards.
    final_version: str
    # Derived boolean flags and per-phase counts for `final_version`.
    status: CapsuleWorkerStatusDict
    # Raw workers bucketed by phase and version.
    worker_info: WorkerInfoDict
320
-
321
-
322
def _capsule_worker_status_diff(
    current_status: CapsuleWorkerSemanticStatus,
    previous_status: Union[CapsuleWorkerSemanticStatus, None],
) -> List[str]:
    """
    Build human-readable messages describing how worker status changed between
    two consecutive polls of a capsule.

    Parameters
    ----------
    current_status : CapsuleWorkerSemanticStatus
        The most recent semantic worker status.
    previous_status : CapsuleWorkerSemanticStatus or None
        The status from the previous poll; ``None`` on the first poll, in which
        case the current counts are summarized instead of diffed.

    Returns
    -------
    List[str]
        Emoji-prefixed status messages; empty when nothing noteworthy changed.
    """
    # Fix vs. original: removed the `version` locals that were assigned in both
    # branches but never used, and dropped f-string prefixes from constant messages.
    curr = current_status["status"]["current_info"]
    changes: List[str] = []

    if previous_status is None:
        # First observation: summarize whatever is present right now.
        if curr["pending"] > 0:
            changes.append(f"⏳ {curr['pending']} worker(s) pending")
        if curr["running"] > 0:
            changes.append(f"🚀 {curr['running']} worker(s) currently running")
        if curr["crashlooping"] > 0:
            changes.append(f"💥 {curr['crashlooping']} worker(s) currently crashlooping")
        return changes

    prev = previous_status["status"]["current_info"]

    # Worker count deltas for the target version.
    pending_diff = curr["pending"] - prev["pending"]
    running_diff = curr["running"] - prev["running"]
    crash_diff = curr["crashlooping"] - prev["crashlooping"]

    if pending_diff > 0:
        changes.append(
            f"⏳ {pending_diff} new worker(s) pending. Total pending ({curr['pending']})"
        )

    if running_diff > 0:
        changes.append(
            f"🚀 {running_diff} worker(s) started running. Total running ({curr['running']})"
        )
    elif running_diff < 0:
        changes.append(
            f"🛑 {abs(running_diff)} worker(s) stopped running. Total running ({curr['running']})"
        )

    if crash_diff > 0:
        changes.append(
            f"💥 {crash_diff} worker(s) started crashlooping. Total crashlooping ({curr['crashlooping']})"
        )
    elif crash_diff < 0:
        changes.append(f"🔧 {abs(crash_diff)} worker(s) recovered from crashlooping")

    # Significant one-way transitions since the previous poll.
    prev_flags = previous_status["status"]
    curr_flags = current_status["status"]

    if not prev_flags["at_least_one_running"] and curr_flags["at_least_one_running"]:
        changes.append("✅ First worker came online")

    if not prev_flags["all_running"] and curr_flags["all_running"]:
        changes.append("🎉 All workers are now running")

    if (
        not prev_flags["at_least_one_crashlooping"]
        and curr_flags["at_least_one_crashlooping"]
    ):
        changes.append("⚠️ Worker crash detected")

    return changes
402
-
403
-
404
def _capsule_worker_semantic_status(
    workers: List[WorkerStatus], version: str, min_replicas: int
) -> CapsuleWorkerSemanticStatus:
    """
    Summarize raw worker statuses into a semantic status relative to `version`.

    Workers are bucketed by phase ("Pending" / "Running" / "CrashLoopBackOff")
    and, within each phase, by their deployment version; workers with a missing
    version are ignored. The returned flags answer questions such as "is at
    least one worker of the target version running" and "has the deployment
    fully converged" (see CapsuleWorkerStatusDict for the exact semantics).
    """

    def _bucket_by_version(
        _workers: List[WorkerStatus], phase: str
    ) -> Dict[str, List[WorkerStatus]]:
        # Group the workers currently in `phase`, keyed by deployment version.
        buckets: Dict[str, List[WorkerStatus]] = {}
        for worker in _workers:
            if worker.get("phase") != phase:
                continue
            if worker.get("version") is not None:
                buckets.setdefault(worker["version"], []).append(worker)
        return buckets

    pending_workers = _bucket_by_version(workers, "Pending")
    running_workers = _bucket_by_version(workers, "Running")
    crashlooping_workers = _bucket_by_version(workers, "CrashLoopBackOff")

    # Counts for the target version only.
    n_pending = len(pending_workers.get(version, []))
    n_running = len(running_workers.get(version, []))
    n_crashlooping = len(crashlooping_workers.get(version, []))

    status_dict: CapsuleWorkerStatusDict = {
        "at_least_one_pending": n_pending > 0,
        # min(min_replicas, 1) makes this trivially true when min_replicas == 0.
        "at_least_one_running": n_running >= min(min_replicas, 1),
        "at_least_one_crashlooping": n_crashlooping > 0,
        "none_present": n_running == 0 and n_pending == 0 and n_crashlooping == 0,
        "all_running": n_running >= min_replicas,
        # NOTE: the pending/crashlooping checks here look at *all* versions
        # (empty bucket dicts), so "fully finished" also means no stale workers
        # from older deployments are lying around.
        "fully_finished": (
            n_running >= min_replicas
            and len(pending_workers) == 0
            and len(crashlooping_workers) == 0
        ),
        "current_info": {
            "pending": n_pending,
            "running": n_running,
            "crashlooping": n_crashlooping,
        },
    }

    worker_info: WorkerInfoDict = {
        "pending": pending_workers,
        "running": running_workers,
        "crashlooping": crashlooping_workers,
    }

    return {
        "final_version": version,
        "status": status_dict,
        "worker_info": worker_info,
    }