torchx-nightly 2023.3.5__py3-none-any.whl → 2023.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


torchx/schedulers/kubernetes_mcad_scheduler.py

@@ -13,10 +13,9 @@ components on a Kubernetes cluster via the Multi-Cluster-Application-Dispatcher
  Prerequisites
  ==============

- TorchX kubernetes scheduler depends on AppWrapper + MCAD.
-
- Install MCAD
+ TorchX Kubernetes_MCAD scheduler depends on AppWrapper + MCAD.

+ Install MCAD:
  See deploying Multi-Cluster-Application-Dispatcher guide
  https://github.com/project-codeflare/multi-cluster-app-dispatcher/blob/main/doc/deploy/deployment.md

@@ -168,6 +167,7 @@ def role_to_pod(
  role: Role,
  service_account: Optional[str],
  image_secret: Optional[str],
+ coscheduler_name: Optional[str],
  ) -> "V1Pod":
  from kubernetes.client.models import ( # noqa: F811 redefinition of unused
  V1Container,
@@ -338,6 +338,7 @@ def role_to_pod(
  service_account_name=service_account,
  volumes=volumes,
  node_selector=node_selector,
+ scheduler_name=coscheduler_name,
  ),
  metadata=V1ObjectMeta(
  name=name,
@@ -352,6 +353,31 @@ def role_to_pod(
  )


+ def create_pod_group(role: Role, namespace: str, app_id: str) -> "Dict[str, Any]":
+ pod_group_name = app_id + "-" + cleanup_str(role.name) + "-pg"
+
+ pod_group: Dict[str, Any] = {
+ "apiVersion": "scheduling.sigs.k8s.io/v1alpha1",
+ "kind": "PodGroup",
+ "metadata": {
+ "name": pod_group_name,
+ "namespace": namespace,
+ "labels": {
+ "appwrapper.mcad.ibm.com": app_id,
+ },
+ },
+ "spec": {
+ "minMember": role.num_replicas,
+ },
+ }
+
+ genericitem_pod_group: Dict[str, Any] = {
+ "replicas": 1,
+ "generictemplate": pod_group,
+ }
+ return genericitem_pod_group
+
+
  def mcad_svc(svc_name: str, namespace: str, service_port: str) -> "V1Service":
  from kubernetes.client.models import ( # noqa: F401, F811
  V1Container,
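A minimal usage sketch of the new helper (not part of the diff): it assumes create_pod_group is importable from torchx.schedulers.kubernetes_mcad_scheduler, and the Role values and app_id below are made-up examples.

# Hedged sketch: exercising create_pod_group as added in this release.
from torchx.specs import Role
from torchx.schedulers.kubernetes_mcad_scheduler import create_pod_group

role = Role(name="trainer", image="alpine:latest", num_replicas=2)
item = create_pod_group(role, namespace="default", app_id="echo-abc123")

# The generic item wraps a PodGroup template whose minMember tracks the role's
# replica count and whose name follows "<app_id>-<cleaned_role_name>-pg".
assert item["replicas"] == 1
assert item["generictemplate"]["kind"] == "PodGroup"
assert item["generictemplate"]["spec"]["minMember"] == 2
assert item["generictemplate"]["metadata"]["name"] == "echo-abc123-trainer-pg"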
@@ -436,6 +462,7 @@ def app_to_resource(
  namespace: str,
  service_account: Optional[str],
  image_secret: Optional[str],
+ coscheduler_name: Optional[str],
  priority: Optional[int] = None,
  ) -> Dict[str, Any]:
  """
@@ -448,6 +475,12 @@ def app_to_resource(
  genericitems = []

  unique_app_id = cleanup_str(make_unique(app.name))
+
+ if coscheduler_name is not None:
+ for role_idx, role in enumerate(app.roles):
+ genericitem_pod_group = create_pod_group(role, namespace, unique_app_id)
+ genericitems.append(genericitem_pod_group)
+
  for role_idx, role in enumerate(app.roles):
  for replica_id in range(role.num_replicas):
  values = macros.Values(
@@ -473,8 +506,18 @@ def app_to_resource(
  replica_role,
  service_account,
  image_secret,
+ coscheduler_name,
+ )
+ pod.metadata.labels.update(
+ pod_labels(
+ app=app,
+ role_idx=role_idx,
+ role=role,
+ replica_id=replica_id,
+ coscheduler_name=coscheduler_name,
+ app_id=unique_app_id,
+ )
  )
- pod.metadata.labels.update(pod_labels(app, role_idx, role, replica_id))

  genericitem: Dict[str, Any] = {
  "replicas": 1,
@@ -676,6 +719,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
  service_account: Optional[str]
  priority: Optional[int]
  image_secret: Optional[str]
+ coscheduler_name: Optional[str]


  class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
@@ -699,6 +743,14 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts
  $ torchx run --scheduler kubernetes_mcad --scheduler_args namespace=default,image_repo=<your_image_repo> utils.echo --image alpine:latest --msg hello
  ...

+ The TorchX-MCAD scheduler can be used with a secondary scheduler on Kubernetes.
+ To enable this, the user must provide the name of the coscheduler.
+ With this feature, a PodGroup is defined for each TorchX role and the coscheduler
+ handles secondary scheduling on the Kubernetes cluster. For additional resources, see:
+ 1. PodGroups and Coscheduling: https://github.com/kubernetes-sigs/scheduler-plugins/tree/release-1.24/pkg/coscheduling
+ 2. Installing Secondary schedulers: https://github.com/kubernetes-sigs/scheduler-plugins/blob/release-1.24/doc/install.md
+ 3. PodGroup CRD: https://github.com/kubernetes-sigs/scheduler-plugins/blob/release-1.24/config/crd/bases/scheduling.sigs.k8s.io_podgroups.yaml
+
  **Config Options**

  .. runopts::
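A hedged submission sketch building on the docstring above (not part of the diff): the same idea as the CLI example, but through the Python runner, with the new coscheduler_name option set; on the CLI this corresponds to appending coscheduler_name=<name> to --scheduler_args. The scheduler name "scheduler-plugins-scheduler" and the AppDef values are assumptions for illustration.

# Hedged sketch: submitting with the new coscheduler_name scheduler option.
from torchx.runner import get_runner
from torchx.specs import AppDef, Role

app = AppDef(
    name="echo",
    roles=[
        Role(
            name="echo",
            image="alpine:latest",
            entrypoint="echo",
            args=["hello"],
            num_replicas=2,
        )
    ],
)

runner = get_runner()
app_handle = runner.run(
    app,
    scheduler="kubernetes_mcad",
    cfg={
        "namespace": "default",
        "coscheduler_name": "scheduler-plugins-scheduler",  # assumed co-scheduler name
    },
)
print(runner.status(app_handle))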
@@ -861,9 +913,20 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts
  namespace = cfg.get("namespace")
  assert isinstance(namespace, str), "namespace must be a str"

+ coscheduler_name = cfg.get("coscheduler_name")
+ assert coscheduler_name is None or isinstance(
+ coscheduler_name, str
+ ), "coscheduler_name must be a string"
+
  resource = app_to_resource(
- app, namespace, service_account, image_secret, priority
+ app=app,
+ namespace=namespace,
+ service_account=service_account,
+ image_secret=image_secret,
+ coscheduler_name=coscheduler_name,
+ priority=priority,
  )
+
  req = KubernetesMCADJob(
  resource=resource,
  images_to_push=images_to_push,
@@ -917,6 +980,11 @@ class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts
  type_=str,
  help="The name of the Kubernetes/OpenShift secret set up for private images",
  )
+ opts.add(
+ "coscheduler_name",
+ type_=str,
+ help="Option to run TorchX-MCAD with a co-scheduler. User must provide the co-scheduler name.",
+ )
  return opts

  def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
@@ -1070,12 +1138,21 @@ def create_scheduler(session_name: str, **kwargs: Any) -> KubernetesMCADSchedule

  # TODO update to Kubernetes standard labels (https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/)
  def pod_labels(
- app: AppDef, role_idx: int, role: Role, replica_id: int
+ app: AppDef,
+ role_idx: int,
+ role: Role,
+ replica_id: int,
+ coscheduler_name: Optional[str],
+ app_id: str,
  ) -> Dict[str, str]:
- return {
+ labels = {
  LABEL_VERSION: torchx.__version__,
  LABEL_APP_NAME: app.name,
  LABEL_ROLE_INDEX: str(role_idx),
  LABEL_ROLE_NAME: role.name,
  LABEL_REPLICA_ID: str(replica_id),
  }
+ if coscheduler_name is not None:
+ pod_group = app_id + "-" + cleanup_str(role.name) + "-pg"
+ labels.update({"pod-group.scheduling.sigs.k8s.io": pod_group})
+ return labels
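A short sketch of the reworked pod_labels (assumptions as above: import path, example Role/AppDef, app_id, and co-scheduler name): when coscheduler_name is set, each pod carries a pod-group.scheduling.sigs.k8s.io label matching the PodGroup name produced by create_pod_group.

# Hedged sketch: the extra label that ties a pod to its PodGroup.
from torchx.specs import AppDef, Role
from torchx.schedulers.kubernetes_mcad_scheduler import pod_labels

role = Role(name="trainer", image="alpine:latest", num_replicas=2)
app = AppDef(name="echo", roles=[role])

labels = pod_labels(
    app=app,
    role_idx=0,
    role=role,
    replica_id=1,
    coscheduler_name="scheduler-plugins-scheduler",  # assumed co-scheduler name
    app_id="echo-abc123",
)

# Alongside the existing torchx LABEL_* entries:
assert labels["pod-group.scheduling.sigs.k8s.io"] == "echo-abc123-trainer-pg"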
torchx_nightly-2023.3.5.dist-info/METADATA → torchx_nightly-2023.3.6.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: torchx-nightly
- Version: 2023.3.5
+ Version: 2023.3.6
  Summary: TorchX SDK and Components
  Home-page: https://github.com/pytorch/torchx
  Author: TorchX Devs
torchx_nightly-2023.3.5.dist-info/RECORD → torchx_nightly-2023.3.6.dist-info/RECORD

@@ -68,7 +68,7 @@ torchx/schedulers/devices.py,sha256=PNbcpf8fEM18Ag1RgK9Q30zPBalEcPdsFWctdbLxuv8,
  torchx/schedulers/docker_scheduler.py,sha256=d7RjnZg_rMANXj5OPzob-mprxokVlG_iPVIMIl5L89c,15247
  torchx/schedulers/gcp_batch_scheduler.py,sha256=MNlZGjNZVd-E7m7sR5DN0BidRJOU0w8TZDpovdU3d9o,15993
  torchx/schedulers/ids.py,sha256=IGsJEbCYTdfKdU3MhKLQU6b7sWCJy5dlRV6JIL_9BlE,1783
- torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=BeUg827iSCPRJAJ4mU_rj7M4-Q-txHYqGrl8fnKO3rE,35493
+ torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=fFfcwC6x7fe1G48cwHaDQb9LNMk7PKoZ27gTNCnfPbg,38388
  torchx/schedulers/kubernetes_scheduler.py,sha256=4T1FtAhiIql4ZLZ2Xr55Z3nXCwAiv1Png8SXxP9c9ss,26345
  torchx/schedulers/local_scheduler.py,sha256=LKbWjw4cJuoryvaBjbHXVOVeTYOXaNDcKk61Grjg8Eg,37442
  torchx/schedulers/lsf_scheduler.py,sha256=SwstV-aBc8ZwAsjOViNiPfQX9iVLFmMDBba3Q2JglIE,17538
@@ -105,9 +105,9 @@ torchx/workspace/__init__.py,sha256=KbGEzJqqXaIxALm_EQO64aw-fE7MeDMFXcpU1mY650I,
  torchx/workspace/api.py,sha256=Ej6DR__mNWaVyZgoVNAAOloDy1kTD5X1jz7pRtoVf80,5464
  torchx/workspace/dir_workspace.py,sha256=Fz-hKIx0KN8iJf2BsthNj0NvTkWlxP6WFsElPs_BaT0,2253
  torchx/workspace/docker_workspace.py,sha256=Yd8ut26bNfjyJQnmH8ANOrflfr-4VKcnOrIjbi_XIUY,9208
- torchx_nightly-2023.3.5.dist-info/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
- torchx_nightly-2023.3.5.dist-info/METADATA,sha256=jKXF5g1zg3XSatC-RZY2D4XtXCkmW1EI7vXzCunVgoQ,5343
- torchx_nightly-2023.3.5.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
- torchx_nightly-2023.3.5.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
- torchx_nightly-2023.3.5.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
- torchx_nightly-2023.3.5.dist-info/RECORD,,
+ torchx_nightly-2023.3.6.dist-info/LICENSE,sha256=e0Eotbf_rHOYPuEUlppIbvwy4SN98CZnl_hqwvbDA4Q,1530
+ torchx_nightly-2023.3.6.dist-info/METADATA,sha256=i2rs4GAytFuI0A3g93hkqIz3BKjfOBHhcp4XcTbwa3A,5343
+ torchx_nightly-2023.3.6.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+ torchx_nightly-2023.3.6.dist-info/entry_points.txt,sha256=3JYZFlX9aWzR-Gs_qsx1zq7mlqbFz6Mi9rQUULW8caI,170
+ torchx_nightly-2023.3.6.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+ torchx_nightly-2023.3.6.dist-info/RECORD,,