ob-metaflow-extensions 1.3.1__py2.py3-none-any.whl → 1.3.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +7 -1
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +9 -2
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +5 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +34 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +4 -10
- {ob_metaflow_extensions-1.3.1.dist-info → ob_metaflow_extensions-1.3.2.dist-info}/METADATA +1 -1
- {ob_metaflow_extensions-1.3.1.dist-info → ob_metaflow_extensions-1.3.2.dist-info}/RECORD +10 -10
- {ob_metaflow_extensions-1.3.1.dist-info → ob_metaflow_extensions-1.3.2.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.3.1.dist-info → ob_metaflow_extensions-1.3.2.dist-info}/top_level.txt +0 -0
|
@@ -180,6 +180,7 @@ class WorkerInfoDict(TypedDict):
|
|
|
180
180
|
pending: Dict[str, List[WorkerStatus]]
|
|
181
181
|
running: Dict[str, List[WorkerStatus]]
|
|
182
182
|
crashlooping: Dict[str, List[WorkerStatus]]
|
|
183
|
+
failed: Dict[str, List[WorkerStatus]]
|
|
183
184
|
|
|
184
185
|
|
|
185
186
|
class CurrentWorkerInfo(TypedDict):
|
|
@@ -442,9 +443,11 @@ def _capsule_worker_semantic_status(
|
|
|
442
443
|
xx[worker_version].append(w)
|
|
443
444
|
return xx
|
|
444
445
|
|
|
446
|
+
# phases can be Pending, Running, Succeeded, Failed, Unknown, CrashLoopBackOff
|
|
445
447
|
pending_workers = _make_version_dict(workers, "Pending")
|
|
446
448
|
running_workers = _make_version_dict(workers, "Running")
|
|
447
449
|
crashlooping_workers = _make_version_dict(workers, "CrashLoopBackOff")
|
|
450
|
+
failed_workers = _make_version_dict(workers, "Failed")
|
|
448
451
|
|
|
449
452
|
# current_status (formulated basis):
|
|
450
453
|
# - at least one pod is pending for `_end_state_capsule_version`
|
|
@@ -464,7 +467,8 @@ def _capsule_worker_semantic_status(
|
|
|
464
467
|
"at_least_one_running": (
|
|
465
468
|
count_for_version(running_workers) >= min(min_replicas, 1)
|
|
466
469
|
),
|
|
467
|
-
"at_least_one_crashlooping": count_for_version(crashlooping_workers) > 0
|
|
470
|
+
"at_least_one_crashlooping": count_for_version(crashlooping_workers) > 0
|
|
471
|
+
or count_for_version(failed_workers) > 0,
|
|
468
472
|
"none_present": (
|
|
469
473
|
count_for_version(running_workers) == 0
|
|
470
474
|
and count_for_version(pending_workers) == 0
|
|
@@ -484,6 +488,7 @@ def _capsule_worker_semantic_status(
|
|
|
484
488
|
"pending": count_for_version(pending_workers),
|
|
485
489
|
"running": count_for_version(running_workers),
|
|
486
490
|
"crashlooping": count_for_version(crashlooping_workers),
|
|
491
|
+
"failed": count_for_version(failed_workers),
|
|
487
492
|
},
|
|
488
493
|
}
|
|
489
494
|
|
|
@@ -491,6 +496,7 @@ def _capsule_worker_semantic_status(
|
|
|
491
496
|
"pending": pending_workers,
|
|
492
497
|
"running": running_workers,
|
|
493
498
|
"crashlooping": crashlooping_workers,
|
|
499
|
+
"failed": failed_workers,
|
|
494
500
|
}
|
|
495
501
|
|
|
496
502
|
return {
|
|
@@ -255,6 +255,12 @@ class CapsuleInput:
|
|
|
255
255
|
replicas.get("min"),
|
|
256
256
|
replicas.get("max"),
|
|
257
257
|
)
|
|
258
|
+
rpm = replicas.get("scaling_policy", {}).get("rpm", None)
|
|
259
|
+
autoscaling_config = {}
|
|
260
|
+
if rpm:
|
|
261
|
+
autoscaling_config = {
|
|
262
|
+
"requestRateBasedAutoscalingConfig": {"targetRequestsPerMinute": rpm}
|
|
263
|
+
}
|
|
258
264
|
if fixed is not None:
|
|
259
265
|
_min, _max = fixed, fixed
|
|
260
266
|
gpu_resource = app_config.get_state("resources").get("gpu")
|
|
@@ -296,6 +302,7 @@ class CapsuleInput:
|
|
|
296
302
|
"autoscalingConfig": {
|
|
297
303
|
"minReplicas": _min,
|
|
298
304
|
"maxReplicas": _max,
|
|
305
|
+
**autoscaling_config,
|
|
299
306
|
},
|
|
300
307
|
**_scheduling_config,
|
|
301
308
|
"containerStartupConfig": {
|
|
@@ -713,7 +720,7 @@ class CapsuleDeployer:
|
|
|
713
720
|
workers_status: List[WorkerStatus],
|
|
714
721
|
):
|
|
715
722
|
for worker in workers_status:
|
|
716
|
-
if worker["phase"] == "CrashLoopBackOff":
|
|
723
|
+
if worker["phase"] == "CrashLoopBackOff" or worker["phase"] == "Failed":
|
|
717
724
|
return worker["workerId"]
|
|
718
725
|
return None
|
|
719
726
|
|
|
@@ -851,7 +858,7 @@ class CapsuleDeployer:
|
|
|
851
858
|
workers_state_machine.save_debug_info(self._debug_dir)
|
|
852
859
|
if i % 3 == 0: # Every 3 seconds report the status
|
|
853
860
|
logger(
|
|
854
|
-
f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status}"
|
|
861
|
+
f"[debug] 💊 {self.capsule_type} {self.identifier} deployment status: {state_machine.current_status} | worker states: {workers_state_machine.current_status} | capsule_ready : {capsule_ready} | further_check_worker_readiness {further_check_worker_readiness}"
|
|
855
862
|
)
|
|
856
863
|
|
|
857
864
|
# We will only check ready_to_serve_traffic under the following conditions:
|
|
@@ -51,6 +51,11 @@ class ReplicaConfigDict(TypedDict, total=False):
|
|
|
51
51
|
fixed: Optional[int]
|
|
52
52
|
min: Optional[int]
|
|
53
53
|
max: Optional[int]
|
|
54
|
+
scaling_policy: Optional["ScalingPolicyConfigDict"]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class ScalingPolicyConfigDict(TypedDict, total=False):
|
|
58
|
+
rpm: Optional[int]
|
|
54
59
|
|
|
55
60
|
|
|
56
61
|
class DependencyConfigDict(TypedDict, total=False):
|
|
@@ -301,6 +301,29 @@ class AuthConfig(metaclass=ConfigMeta):
|
|
|
301
301
|
)
|
|
302
302
|
|
|
303
303
|
|
|
304
|
+
class ScalingPolicyConfig(metaclass=ConfigMeta):
|
|
305
|
+
"""
|
|
306
|
+
Policies for autoscaling replicas. Available policies:
|
|
307
|
+
- Request based Autoscaling (rpm)
|
|
308
|
+
"""
|
|
309
|
+
|
|
310
|
+
# TODO Change the defaulting if we have more autoscaling policies.
|
|
311
|
+
rpm = ConfigField(
|
|
312
|
+
field_type=int,
|
|
313
|
+
# TODO: Add a little more to the docstring where we explain the behavior.
|
|
314
|
+
cli_meta=CLIOption(
|
|
315
|
+
name="scaling_rpm",
|
|
316
|
+
cli_option_str="--scaling-rpm",
|
|
317
|
+
help=(
|
|
318
|
+
"Scale up replicas when the requests per minute crosses this threshold. "
|
|
319
|
+
"If nothing is provided and the replicas.max and replicas.min is set then "
|
|
320
|
+
"the default rpm would be 60."
|
|
321
|
+
),
|
|
322
|
+
),
|
|
323
|
+
default=60,
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
|
|
304
327
|
class ReplicaConfig(metaclass=ConfigMeta):
|
|
305
328
|
"""Replica configuration."""
|
|
306
329
|
|
|
@@ -333,6 +356,16 @@ class ReplicaConfig(metaclass=ConfigMeta):
|
|
|
333
356
|
example=10,
|
|
334
357
|
)
|
|
335
358
|
|
|
359
|
+
scaling_policy = ConfigField(
|
|
360
|
+
cli_meta=None,
|
|
361
|
+
field_type=ScalingPolicyConfig,
|
|
362
|
+
help=(
|
|
363
|
+
"Scaling policy defines the metric based on which the replicas will horizontally scale. "
|
|
364
|
+
"If min and max replicas are set and are not the same, then a scaling policy will be applied. "
|
|
365
|
+
"Default scaling policies can be 60 rpm (ie 1 rps). "
|
|
366
|
+
),
|
|
367
|
+
)
|
|
368
|
+
|
|
336
369
|
@staticmethod
|
|
337
370
|
def defaults(replica_config: "ReplicaConfig"):
|
|
338
371
|
if all(
|
|
@@ -346,6 +379,7 @@ class ReplicaConfig(metaclass=ConfigMeta):
|
|
|
346
379
|
replica_config.fixed = 1
|
|
347
380
|
elif replica_config.min is not None and replica_config.max is None:
|
|
348
381
|
replica_config.max = replica_config.min
|
|
382
|
+
|
|
349
383
|
return
|
|
350
384
|
|
|
351
385
|
@staticmethod
|
|
@@ -161,6 +161,21 @@ properties:
|
|
|
161
161
|
type: integer
|
|
162
162
|
example: 10
|
|
163
163
|
mutation_behavior: union
|
|
164
|
+
scaling_policy:
|
|
165
|
+
title: ScalingPolicyConfig
|
|
166
|
+
description: |-
|
|
167
|
+
Policies for autoscaling replicas. Available policies:
|
|
168
|
+
- Request based Autoscaling (rpm)
|
|
169
|
+
type: object
|
|
170
|
+
required: []
|
|
171
|
+
properties:
|
|
172
|
+
rpm:
|
|
173
|
+
description: |-
|
|
174
|
+
Scale up replicas when the requests per minute crosses this threshold. If nothing is provided and the replicas.max and replicas.min is set then the default rpm would be 60.
|
|
175
|
+
type: integer
|
|
176
|
+
default: 60
|
|
177
|
+
mutation_behavior: union
|
|
178
|
+
mutation_behavior: union
|
|
164
179
|
mutation_behavior: union
|
|
165
180
|
dependencies:
|
|
166
181
|
title: DependencyConfig
|
|
@@ -151,7 +151,6 @@ class AppDeployer(TypedCoreConfig):
|
|
|
151
151
|
final_status["id"],
|
|
152
152
|
final_status["auth_type"],
|
|
153
153
|
final_status["public_url"],
|
|
154
|
-
final_status["available_replicas"],
|
|
155
154
|
final_status["name"],
|
|
156
155
|
final_status["deployed_version"],
|
|
157
156
|
final_status["deployed_at"],
|
|
@@ -164,7 +163,6 @@ class DeployedApp:
|
|
|
164
163
|
_id: str,
|
|
165
164
|
capsule_type: str,
|
|
166
165
|
public_url: str,
|
|
167
|
-
available_replicas: int,
|
|
168
166
|
name: str,
|
|
169
167
|
deployed_version: str,
|
|
170
168
|
deployed_at: str,
|
|
@@ -172,7 +170,6 @@ class DeployedApp:
|
|
|
172
170
|
self._id = _id
|
|
173
171
|
self._capsule_type = capsule_type
|
|
174
172
|
self._public_url = public_url
|
|
175
|
-
self._available_replicas = available_replicas
|
|
176
173
|
self._name = name
|
|
177
174
|
self._deployed_version = deployed_version
|
|
178
175
|
self._deployed_at = deployed_at
|
|
@@ -208,6 +205,10 @@ class DeployedApp:
|
|
|
208
205
|
capsule = capsule_api.get(self._id)
|
|
209
206
|
return capsule
|
|
210
207
|
|
|
208
|
+
def replicas(self):
|
|
209
|
+
capsule_api = self._get_capsule_api()
|
|
210
|
+
return capsule_api.get_workers(self._id)
|
|
211
|
+
|
|
211
212
|
def scale_to_zero(self):
|
|
212
213
|
"""
|
|
213
214
|
Scales the DeployedApp to 0 replicas.
|
|
@@ -243,10 +244,6 @@ class DeployedApp:
|
|
|
243
244
|
def public_url(self) -> str:
|
|
244
245
|
return self._public_url
|
|
245
246
|
|
|
246
|
-
@property
|
|
247
|
-
def available_replicas(self) -> int:
|
|
248
|
-
return self._available_replicas
|
|
249
|
-
|
|
250
247
|
@property
|
|
251
248
|
def name(self) -> str:
|
|
252
249
|
return self._name
|
|
@@ -260,7 +257,6 @@ class DeployedApp:
|
|
|
260
257
|
"id": self._id,
|
|
261
258
|
"auth_style": self.auth_style, # TODO : Fix naming here.
|
|
262
259
|
"public_url": self._public_url,
|
|
263
|
-
"available_replicas": self._available_replicas,
|
|
264
260
|
"name": self._name,
|
|
265
261
|
"deployed_version": self._deployed_version,
|
|
266
262
|
"deployed_at": self._deployed_at,
|
|
@@ -272,7 +268,6 @@ class DeployedApp:
|
|
|
272
268
|
_id=data["id"],
|
|
273
269
|
capsule_type=data["capsule_type"],
|
|
274
270
|
public_url=data["public_url"],
|
|
275
|
-
available_replicas=data["available_replicas"],
|
|
276
271
|
name=data["name"],
|
|
277
272
|
deployed_version=data["deployed_version"],
|
|
278
273
|
deployed_at=data["deployed_at"],
|
|
@@ -287,7 +282,6 @@ class DeployedApp:
|
|
|
287
282
|
f"DeployedApp(id='{self._id}', "
|
|
288
283
|
f"name='{self._name}', "
|
|
289
284
|
f"public_url='{self._public_url}', "
|
|
290
|
-
f"available_replicas={self._available_replicas}, "
|
|
291
285
|
f"deployed_version='{self._deployed_version}')"
|
|
292
286
|
)
|
|
293
287
|
|
|
@@ -12,15 +12,15 @@ metaflow_extensions/outerbounds/plugins/apps/consts.py,sha256=iHsyqbUg9k-rgswCs1
|
|
|
12
12
|
metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py,sha256=VkmiMdNYHhNdt-Qm9AVv7aE2LWFsIFEc16YcOYjwF6Q,8568
|
|
13
13
|
metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py,sha256=GQoN2gyPClcpR9cLldJmbCfqXnoAHxp8xUnY7vzaYtY,9026
|
|
14
14
|
metaflow_extensions/outerbounds/plugins/apps/core/__init__.py,sha256=c6uCgKlgEkTmM9BVdAO-m3vZvUpK2KW_AZZ2236now4,237
|
|
15
|
-
metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py,sha256=
|
|
15
|
+
metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py,sha256=al907t2C86BPf4V1V03PLTJRJMOc8gdl1CxLLbklnDU,20281
|
|
16
16
|
metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py,sha256=V0Ki_VwjVIyIa2sgXPC7miOPLYWLrsHvzMpTfQypU2U,42169
|
|
17
17
|
metaflow_extensions/outerbounds/plugins/apps/core/app_config.py,sha256=PHt-HdNfTHIuhY-eB5vkRMp1RKQNWJ4DKdgZWyYgUuc,4167
|
|
18
18
|
metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
-
metaflow_extensions/outerbounds/plugins/apps/core/capsule.py,sha256=
|
|
19
|
+
metaflow_extensions/outerbounds/plugins/apps/core/capsule.py,sha256=VpCmq8R13GNex6aTJnOCswkLnc8acgsQQ9Da6KBh2sQ,34732
|
|
20
20
|
metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py,sha256=kgoPQmK_-8PSSTc3QMSaynCLQ5VWTkKFOC69FPURyXA,998
|
|
21
|
-
metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml,sha256=
|
|
21
|
+
metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml,sha256=LfA72d_bqsAuRzFZ9q-DfbiUy1mLimuFQfGwIEhoKNo,8745
|
|
22
22
|
metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py,sha256=JlWT9f27yzZeJPlqTQk134WDfQgOdyxC5iaw3pLlhqY,4006
|
|
23
|
-
metaflow_extensions/outerbounds/plugins/apps/core/deployer.py,sha256=
|
|
23
|
+
metaflow_extensions/outerbounds/plugins/apps/core/deployer.py,sha256=dNKlDu6n8SufEd5NKmsErl1RYhQXuEe_DgtA0mk7awg,9472
|
|
24
24
|
metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py,sha256=jeFGAUnFQkBFiOMp_Ls7Ofb80Qogh509suam5sMucYU,3030
|
|
25
25
|
metaflow_extensions/outerbounds/plugins/apps/core/secrets.py,sha256=sgDiAmpSC8Y5xjlaOEp79F6m0S3x4RONf_vJ5PUAfu8,6127
|
|
26
26
|
metaflow_extensions/outerbounds/plugins/apps/core/utils.py,sha256=2M2zU8DhbAlJee8P0xKXINAku81PcUylS3sVCSb0TUs,7896
|
|
@@ -35,9 +35,9 @@ metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py,sha256=ZgC9
|
|
|
35
35
|
metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py,sha256=0R0-wy7RxAMR9doVRvuluRYxAYgyjZXlTIkOeYGyz7M,5350
|
|
36
36
|
metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py,sha256=bozzUR8rbfOnb5M532RZxB5QNvVgEC1gnVjfCvQ82Yk,34053
|
|
37
37
|
metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py,sha256=tigPtb0we-urwbmctG1GbaQ9NKRKZn4KBbJKmaEntCg,9501
|
|
38
|
-
metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py,sha256=
|
|
38
|
+
metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py,sha256=euoS1Ap4yvHC20Aaj5YQWMgxixkxujVeiJ7C4DcAFhQ,4590
|
|
39
39
|
metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py,sha256=KiJ1eiwtBR5eWdBzWqvO6KlqJ2qzjJvl3w4c1uJ3g0Y,13419
|
|
40
|
-
metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py,sha256=
|
|
40
|
+
metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py,sha256=bO-g_6mv7xciVcDf4Jn-qioPUUvg9Y3fMM5fcraN2Sk,37018
|
|
41
41
|
metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py,sha256=rd4qGTkHndKYfJmoAKZWiY0KK4j5BK6RBrtle-it1Mg,2746
|
|
42
42
|
metaflow_extensions/outerbounds/plugins/aws/__init__.py,sha256=VBGdjNKeFLXGZuqh4jVk8cFtO1AWof73a6k_cnbAOYA,145
|
|
43
43
|
metaflow_extensions/outerbounds/plugins/aws/assume_role.py,sha256=mBewNlnSYsR2rFXFkX-DUH6ku01h2yOcMcLHoCL7eyI,161
|
|
@@ -124,7 +124,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2
|
|
|
124
124
|
metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
|
|
125
125
|
metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py,sha256=uTVkdSk3xZ7hEKYfdlyVteWj5KeDwaM1hU9WT-_YKfI,50
|
|
126
126
|
metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py,sha256=ekcgD3KVydf-a0xMI60P4uy6ePkSEoFHiGnDq1JM940,45
|
|
127
|
-
ob_metaflow_extensions-1.3.
|
|
128
|
-
ob_metaflow_extensions-1.3.
|
|
129
|
-
ob_metaflow_extensions-1.3.
|
|
130
|
-
ob_metaflow_extensions-1.3.
|
|
127
|
+
ob_metaflow_extensions-1.3.2.dist-info/METADATA,sha256=6Q5Etz6OSCEQL-RBWyUWuJqcrHj8J92vfslA-ldZs4M,518
|
|
128
|
+
ob_metaflow_extensions-1.3.2.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
129
|
+
ob_metaflow_extensions-1.3.2.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
130
|
+
ob_metaflow_extensions-1.3.2.dist-info/RECORD,,
|
|
File without changes
|
{ob_metaflow_extensions-1.3.1.dist-info → ob_metaflow_extensions-1.3.2.dist-info}/top_level.txt
RENAMED
|
File without changes
|