torchx-nightly 2023.3.5__py3-none-any.whl → 2023.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of torchx-nightly might be problematic.
- torchx/schedulers/kubernetes_mcad_scheduler.py +84 -7
- {torchx_nightly-2023.3.5.dist-info → torchx_nightly-2023.3.6.dist-info}/METADATA +1 -1
- {torchx_nightly-2023.3.5.dist-info → torchx_nightly-2023.3.6.dist-info}/RECORD +7 -7
- {torchx_nightly-2023.3.5.dist-info → torchx_nightly-2023.3.6.dist-info}/LICENSE +0 -0
- {torchx_nightly-2023.3.5.dist-info → torchx_nightly-2023.3.6.dist-info}/WHEEL +0 -0
- {torchx_nightly-2023.3.5.dist-info → torchx_nightly-2023.3.6.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2023.3.5.dist-info → torchx_nightly-2023.3.6.dist-info}/top_level.txt +0 -0
@@ -13,10 +13,9 @@ components on a Kubernetes cluster via the Multi-Cluster-Application-Dispatcher
 Prerequisites
 ==============
 
-TorchX
-
-Install MCAD
+TorchX Kubernetes_MCAD scheduler depends on AppWrapper + MCAD.
 
+Install MCAD:
 See deploying Multi-Cluster-Application-Dispatcher guide
 https://github.com/project-codeflare/multi-cluster-app-dispatcher/blob/main/doc/deploy/deployment.md
 
@@ -168,6 +167,7 @@ def role_to_pod(
     role: Role,
     service_account: Optional[str],
     image_secret: Optional[str],
+    coscheduler_name: Optional[str],
 ) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
@@ -338,6 +338,7 @@ def role_to_pod(
             service_account_name=service_account,
             volumes=volumes,
             node_selector=node_selector,
+            scheduler_name=coscheduler_name,
         ),
         metadata=V1ObjectMeta(
             name=name,
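Setting scheduler_name on the V1PodSpec is what hands the pod off to the secondary scheduler; when coscheduler_name is None the field stays unset and the default kube-scheduler keeps placing the pod. A minimal sketch of the resulting pod object, assuming the kubernetes Python client and a hypothetical secondary scheduler registered as "my-coscheduler":

    from kubernetes.client.models import V1Container, V1ObjectMeta, V1Pod, V1PodSpec

    # "my-coscheduler" is a placeholder; use the name your secondary
    # scheduler was deployed under.
    pod = V1Pod(
        spec=V1PodSpec(
            containers=[V1Container(name="echo", image="alpine:latest")],
            scheduler_name="my-coscheduler",  # None -> default kube-scheduler
        ),
        metadata=V1ObjectMeta(name="echo-0"),
    )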
@@ -352,6 +353,31 @@ def role_to_pod(
     )
 
 
+def create_pod_group(role: Role, namespace: str, app_id: str) -> "Dict[str, Any]":
+    pod_group_name = app_id + "-" + cleanup_str(role.name) + "-pg"
+
+    pod_group: Dict[str, Any] = {
+        "apiVersion": "scheduling.sigs.k8s.io/v1alpha1",
+        "kind": "PodGroup",
+        "metadata": {
+            "name": pod_group_name,
+            "namespace": namespace,
+            "labels": {
+                "appwrapper.mcad.ibm.com": app_id,
+            },
+        },
+        "spec": {
+            "minMember": role.num_replicas,
+        },
+    }
+
+    genericitem_pod_group: Dict[str, Any] = {
+        "replicas": 1,
+        "generictemplate": pod_group,
+    }
+    return genericitem_pod_group
+
+
 def mcad_svc(svc_name: str, namespace: str, service_port: str) -> "V1Service":
     from kubernetes.client.models import (  # noqa: F401, F811
         V1Container,
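create_pod_group wraps one PodGroup per role in an MCAD generic item; minMember equals the role's replica count, so the coscheduler gang-schedules the role and binds no pod until all replicas fit. Roughly, for a hypothetical two-replica role "trainer" in app "myapp-abc123", the returned item would look like:

    # Illustrative return value of create_pod_group(role, "default", "myapp-abc123")
    # for a role named "trainer" with num_replicas == 2 (names are made up):
    {
        "replicas": 1,
        "generictemplate": {
            "apiVersion": "scheduling.sigs.k8s.io/v1alpha1",
            "kind": "PodGroup",
            "metadata": {
                "name": "myapp-abc123-trainer-pg",
                "namespace": "default",
                "labels": {"appwrapper.mcad.ibm.com": "myapp-abc123"},
            },
            "spec": {"minMember": 2},  # wait for the full gang before binding
        },
    }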
@@ -436,6 +462,7 @@ def app_to_resource(
     namespace: str,
     service_account: Optional[str],
     image_secret: Optional[str],
+    coscheduler_name: Optional[str],
     priority: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
@@ -448,6 +475,12 @@ def app_to_resource(
     genericitems = []
 
     unique_app_id = cleanup_str(make_unique(app.name))
+
+    if coscheduler_name is not None:
+        for role_idx, role in enumerate(app.roles):
+            genericitem_pod_group = create_pod_group(role, namespace, unique_app_id)
+            genericitems.append(genericitem_pod_group)
+
     for role_idx, role in enumerate(app.roles):
         for replica_id in range(role.num_replicas):
             values = macros.Values(
@@ -473,8 +506,18 @@ def app_to_resource(
                 replica_role,
                 service_account,
                 image_secret,
+                coscheduler_name,
+            )
+            pod.metadata.labels.update(
+                pod_labels(
+                    app=app,
+                    role_idx=role_idx,
+                    role=role,
+                    replica_id=replica_id,
+                    coscheduler_name=coscheduler_name,
+                    app_id=unique_app_id,
+                )
             )
-            pod.metadata.labels.update(pod_labels(app, role_idx, role, replica_id))
 
             genericitem: Dict[str, Any] = {
                 "replicas": 1,
@@ -676,6 +719,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
     service_account: Optional[str]
     priority: Optional[int]
     image_secret: Optional[str]
+    coscheduler_name: Optional[str]
 
 
 class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
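Because KubernetesMCADOpts is declared with total=False, every key, including the new coscheduler_name, is optional. A cfg that opts into coscheduling might look like the following (the scheduler name is an example, not a default):

    # Hypothetical run config; omit "coscheduler_name" to keep the
    # default kube-scheduler behavior.
    cfg: KubernetesMCADOpts = {
        "namespace": "default",
        "coscheduler_name": "scheduler-plugins-scheduler",  # example value
    }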
@@ -699,6 +743,14 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
         $ torchx run --scheduler kubernetes_mcad --scheduler_args namespace=default,image_repo=<your_image_repo> utils.echo --image alpine:latest --msg hello
         ...
 
+    The TorchX-MCAD scheduler can be used with a secondary scheduler on Kubernetes.
+    To enable this, the user must provide the name of the coscheduler.
+    With this feature, a PodGroup is defined for each TorchX role and the coscheduler
+    handles secondary scheduling on the Kubernetes cluster. For additional resources, see:
+    1. PodGroups and Coscheduling: https://github.com/kubernetes-sigs/scheduler-plugins/tree/release-1.24/pkg/coscheduling
+    2. Installing Secondary schedulers: https://github.com/kubernetes-sigs/scheduler-plugins/blob/release-1.24/doc/install.md
+    3. PodGroup CRD: https://github.com/kubernetes-sigs/scheduler-plugins/blob/release-1.24/config/crd/bases/scheduling.sigs.k8s.io_podgroups.yaml
+
     **Config Options**
 
     .. runopts::
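On the command line the new option travels with the other scheduler args; extending the run example from the docstring above (the coscheduler value is a placeholder for whatever name your secondary scheduler is registered under):

    $ torchx run --scheduler kubernetes_mcad \
        --scheduler_args namespace=default,image_repo=<your_image_repo>,coscheduler_name=<your_coscheduler> \
        utils.echo --image alpine:latest --msg hello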
@@ -861,9 +913,20 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
         namespace = cfg.get("namespace")
         assert isinstance(namespace, str), "namespace must be a str"
 
+        coscheduler_name = cfg.get("coscheduler_name")
+        assert coscheduler_name is None or isinstance(
+            coscheduler_name, str
+        ), "coscheduler_name must be a string"
+
         resource = app_to_resource(
-            app, namespace, service_account, image_secret, priority
+            app=app,
+            namespace=namespace,
+            service_account=service_account,
+            image_secret=image_secret,
+            coscheduler_name=coscheduler_name,
+            priority=priority,
         )
+
         req = KubernetesMCADJob(
             resource=resource,
             images_to_push=images_to_push,
@@ -917,6 +980,11 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
             type_=str,
             help="The name of the Kubernetes/OpenShift secret set up for private images",
         )
+        opts.add(
+            "coscheduler_name",
+            type_=str,
+            help="Option to run TorchX-MCAD with a co-scheduler. User must provide the co-scheduler name.",
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
@@ -1070,12 +1138,21 @@ def create_scheduler(session_name: str, **kwargs: Any) -> KubernetesMCADScheduler:
 
 # TODO update to Kubernetes standard labels (https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/)
 def pod_labels(
-    app: AppDef, role_idx: int, role: Role, replica_id: int
+    app: AppDef,
+    role_idx: int,
+    role: Role,
+    replica_id: int,
+    coscheduler_name: Optional[str],
+    app_id: str,
 ) -> Dict[str, str]:
-    return {
+    labels = {
         LABEL_VERSION: torchx.__version__,
         LABEL_APP_NAME: app.name,
         LABEL_ROLE_INDEX: str(role_idx),
         LABEL_ROLE_NAME: role.name,
         LABEL_REPLICA_ID: str(replica_id),
     }
+    if coscheduler_name is not None:
+        pod_group = app_id + "-" + cleanup_str(role.name) + "-pg"
+        labels.update({"pod-group.scheduling.sigs.k8s.io": pod_group})
+    return labels
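The pod-group.scheduling.sigs.k8s.io label is what the coscheduler reads to associate a pod with its PodGroup, and it reuses exactly the pod group name built in create_pod_group. Assuming the LABEL_* constants resolve to TorchX's usual torchx.pytorch.org/* keys, a replica of the hypothetical "trainer" role would end up labeled roughly like:

    # Illustrative pod_labels(...) result for replica 0 of role "trainer"
    # in app id "myapp-abc123" with a coscheduler configured:
    {
        "torchx.pytorch.org/version": "2023.3.6",   # LABEL_VERSION (example)
        "torchx.pytorch.org/app-name": "myapp",     # LABEL_APP_NAME
        "torchx.pytorch.org/role-index": "0",       # LABEL_ROLE_INDEX
        "torchx.pytorch.org/role-name": "trainer",  # LABEL_ROLE_NAME
        "torchx.pytorch.org/replica-id": "0",       # LABEL_REPLICA_ID
        "pod-group.scheduling.sigs.k8s.io": "myapp-abc123-trainer-pg",
    }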
@@ -68,7 +68,7 @@ torchx/schedulers/devices.py,sha256=PNbcpf8fEM18Ag1RgK9Q30zPBalEcPdsFWctdbLxuv8,
 torchx/schedulers/docker_scheduler.py,sha256=d7RjnZg_rMANXj5OPzob-mprxokVlG_iPVIMIl5L89c,15247
 torchx/schedulers/gcp_batch_scheduler.py,sha256=MNlZGjNZVd-E7m7sR5DN0BidRJOU0w8TZDpovdU3d9o,15993
 torchx/schedulers/ids.py,sha256=IGsJEbCYTdfKdU3MhKLQU6b7sWCJy5dlRV6JIL_9BlE,1783
-torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=
+torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=fFfcwC6x7fe1G48cwHaDQb9LNMk7PKoZ27gTNCnfPbg,38388
 torchx/schedulers/kubernetes_scheduler.py,sha256=4T1FtAhiIql4ZLZ2Xr55Z3nXCwAiv1Png8SXxP9c9ss,26345
 torchx/schedulers/local_scheduler.py,sha256=LKbWjw4cJuoryvaBjbHXVOVeTYOXaNDcKk61Grjg8Eg,37442
 torchx/schedulers/lsf_scheduler.py,sha256=SwstV-aBc8ZwAsjOViNiPfQX9iVLFmMDBba3Q2JglIE,17538
@@ -105,9 +105,9 @@ torchx/workspace/__init__.py,sha256=KbGEzJqqXaIxALm_EQO64aw-fE7MeDMFXcpU1mY650I,
 torchx/workspace/api.py,sha256=Ej6DR__mNWaVyZgoVNAAOloDy1kTD5X1jz7pRtoVf80,5464
 torchx/workspace/dir_workspace.py,sha256=Fz-hKIx0KN8iJf2BsthNj0NvTkWlxP6WFsElPs_BaT0,2253
 torchx/workspace/docker_workspace.py,sha256=Yd8ut26bNfjyJQnmH8ANOrflfr-4VKcnOrIjbi_XIUY,9208
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
-torchx_nightly-2023.3.
+torchx_nightly-2023.3.6.dist-info/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+torchx_nightly-2023.3.6.dist-info/METADATA,sha256=i2rs4GAytFuI0A3g93hkqIz3BKjfOBHhcp4XcTbwa3A,5343
+torchx_nightly-2023.3.6.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+torchx_nightly-2023.3.6.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
+torchx_nightly-2023.3.6.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2023.3.6.dist-info/RECORD,,
Files without changes: LICENSE, WHEEL, entry_points.txt, top_level.txt