xpk 1.1.0-py3-none-any.whl → 1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/workload.py CHANGED
@@ -481,6 +481,16 @@ def workload_create(args) -> None:
       + lustre_storages
   )
 
+  use_sub_slicing = (
+      workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
+  )
+  use_super_slicing = (
+      workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
+  )
+  parallel_containers = workload_system.parallel_containers
+  if args.use_pathways or use_super_slicing:
+    parallel_containers = 1
+
   # Currently failure policy rules are supported for Pathways workloads. b/408465881
   failure_policy_rules = ''
   pod_failure_policy = ''
@@ -497,10 +507,8 @@ def workload_create(args) -> None:
     rules:
   """
   docker_image = get_main_container_docker_image(args, workload_system)
-  for i in range(workload_system.parallel_containers):
-    docker_image_sufix = (
-        f'-{i + 1}' if workload_system.parallel_containers > 1 else ''
-    )
+  for i in range(parallel_containers):
+    docker_image_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
     pod_failure_policy += f"""
     - action: FailJob
       onPodConditions: []
@@ -533,7 +541,7 @@ def workload_create(args) -> None:
   # Create the workload file based on accelerator type or workload type.
   if workload_system.accelerator_type == AcceleratorType.GPU:
     container, debugging_dashboard_id = get_user_workload_container(
-        args, workload_system
+        args, workload_system, parallel_containers=parallel_containers
     )
     gpu_scheduler, return_code = get_gpu_scheduler(
         args, workload_system, autoprovisioning_args
@@ -624,25 +632,21 @@ def workload_create(args) -> None:
         custom_pathways_server=append_custom_pathways_server(args),
         custom_pathways_worker=append_custom_pathways_worker(args),
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
-        user_workload=get_user_workload_for_pathways(args, workload_system),
+        user_workload=get_user_workload_for_pathways(
+            args, workload_system, parallel_containers
+        ),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
         placement_policy_label=placement_policy_label,
     )
   else:
-    use_sub_slicing = (
-        workload_scheduling == WorkloadScheduling.SUB_SLICING_AVAILABLE
-    )
-    use_super_slicing = (
-        workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
-    )
     if use_sub_slicing:
       xpk_print('Workload will be scheduled using the Sub-slicing feature.')
     if use_super_slicing:
       xpk_print('Workload will be scheduled using the Super-slicing feature.')
 
     container, debugging_dashboard_id = get_user_workload_container(
-        args, workload_system
+        args, workload_system, parallel_containers
     )
 
     machine_label = (
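Taken together, the workload.py hunks hoist the container-count decision to the top of workload_create: Pathways and Super-slicing workloads are pinned to one container per VM, everything else keeps the system default, and the result is passed to every helper instead of being read (and mutated) on system.parallel_containers. A minimal sketch of the hoisted rule, using a hypothetical standalone helper; the enum member names are taken from the diff, their concrete values are assumptions:

import enum


class WorkloadScheduling(enum.Enum):
  # Member names appear in the diff; the values here are assumptions.
  AVAILABLE = enum.auto()
  SUB_SLICING_AVAILABLE = enum.auto()
  SUPER_SLICING_AVAILABLE = enum.auto()
  UNAVAILABLE = enum.auto()


def resolve_parallel_containers(
    use_pathways: bool,
    workload_scheduling: WorkloadScheduling,
    system_parallel_containers: int,
) -> int:
  # Pathways and Super-slicing runs are limited to a single container
  # per VM; all other workloads keep the system's default count.
  if use_pathways or (
      workload_scheduling == WorkloadScheduling.SUPER_SLICING_AVAILABLE
  ):
    return 1
  return system_parallel_containers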
xpk/core/docker_container.py CHANGED
@@ -30,12 +30,18 @@ from .system_characteristics import (
 )
 
 
-def get_main_and_sidecar_container(args, system, docker_image) -> str:
+def get_main_and_sidecar_container(
+    args,
+    system: SystemCharacteristics,
+    docker_image: str,
+    parallel_containers: int,
+) -> str:
   """Generate yaml for main and sidecar container.
   Args:
     args: user provided arguments for running the command.
     system: system characteristics
     docker_image: docker image
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     str:
@@ -44,7 +50,9 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
   resource_type = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].resource_type
-  main_container = get_main_container(args, system, docker_image, resource_type)
+  main_container = get_main_container(
+      args, system, docker_image, resource_type, parallel_containers
+  )
   yaml = """- name: stacktrace-explorer
   image: busybox:1.28
   args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"]
@@ -59,13 +67,20 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
   return yaml.format(main_container=main_container)
 
 
-def get_main_container(args, system, docker_image, resource_type) -> str:
+def get_main_container(
+    args,
+    system: SystemCharacteristics,
+    docker_image: str,
+    resource_type,
+    parallel_containers: int,
+) -> str:
   """Generate yaml for main container including the xpk command.
   Args:
     args: user provided arguments for running the command.
     system: system characteristics
     docker_image: docker image
     resource_type: The label to describe the resource type for TPUs/GPUs/CPUs.
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     str:
@@ -149,14 +164,10 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
   volumeMounts:
   {volume_mounts}
   """
-  # pathways job running on 2 parallel containers is not verified yet
-  if args.use_pathways:
-    system.parallel_containers = 1
-
   env = get_env_container(args, system)
   image_pull_policy = add_image_pull_policy_for_pw_or_gpu(args, system)
-  for i in range(system.parallel_containers):
-    docker_name_sufix = f'-{i + 1}' if system.parallel_containers > 1 else ''
+  for i in range(parallel_containers):
+    docker_name_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
     containers.append(
         container_yaml.format(
             args=args,
@@ -170,19 +181,24 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
             tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
             gpu_workload_terminate_command=gpu_workload_terminate_command,
             xpk_internal_commands=xpk_internal_commands,
-            resources=get_main_container_resources(args, system, resource_type),
+            resources=get_main_container_resources(
+                args, system, resource_type, parallel_containers
+            ),
             volume_mounts=volume_mounts,
         )
     )
   return ''.join(containers)
 
 
-def get_user_workload_container(args, system: SystemCharacteristics):
+def get_user_workload_container(
+    args, system: SystemCharacteristics, parallel_containers: int
+):
   """Deploy user workload container
 
   Args:
     args: user provided args.
     system: system characteristics.
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     container: main container
@@ -209,11 +225,15 @@ def get_user_workload_container(args, system: SystemCharacteristics):
         'Sidecar container to display stack traces for TPU workloads will also'
         ' be deployed.'
     )
-    container = get_main_and_sidecar_container(args, system, docker_image)
+    container = get_main_and_sidecar_container(
+        args, system, docker_image, parallel_containers
+    )
     # Get GKE debugging dashboard only when sidecar container is deployed for TPU workloads
     debugging_dashboard_id = get_gke_debugging_dashboard(args)
   else:
-    container = get_main_container(args, system, docker_image, resource_type)
+    container = get_main_container(
+        args, system, docker_image, resource_type, parallel_containers
+    )
   return container, debugging_dashboard_id
 
 
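The docker_container.py changes swap a hidden side effect (get_main_container silently setting system.parallel_containers = 1 for Pathways) for an explicit parallel_containers argument. The per-container naming convention itself is unchanged; a small sketch of it, using a hypothetical container_names helper that is not part of xpk:

def container_names(base_name: str, parallel_containers: int) -> list[str]:
  # Mirrors the docker_name_sufix logic above: a single container keeps
  # the bare name, while N > 1 containers are suffixed -1 .. -N.
  return [
      f'{base_name}-{i + 1}' if parallel_containers > 1 else base_name
      for i in range(parallel_containers)
  ]


assert container_names('jax-tpu', 1) == ['jax-tpu']
assert container_names('jax-tpu', 2) == ['jax-tpu-1', 'jax-tpu-2']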
xpk/core/docker_resources.py CHANGED
@@ -23,7 +23,10 @@ from ..utils.execution_context import is_dry_run
 
 
 def get_main_container_resources(
-    args, system: SystemCharacteristics, resource_type
+    args,
+    system: SystemCharacteristics,
+    resource_type: str,
+    parallel_containers: int,
 ) -> str:
   """Resources for the main container.
   Args:
@@ -53,10 +56,7 @@ def get_main_container_resources(
     offset_vCPUs = int(system.chips_per_vm) * 0.95
     return f'{resource_type}: {offset_vCPUs}'
 
-  return (
-      f'{resource_type}:'
-      f' {int(system.chips_per_vm / system.parallel_containers)}'
-  )
+  return f'{resource_type}: {int(system.chips_per_vm / parallel_containers)}'
 
 
 def get_env_container(args, system: SystemCharacteristics) -> str:
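In docker_resources.py the accelerator request is likewise divided by the passed-in count rather than by system.parallel_containers. The arithmetic is unchanged: each of the N containers on a VM requests chips_per_vm / N chips. A toy rendering of the returned resource line, with illustrative values (the resource name here is only an example):

def resource_line(
    resource_type: str, chips_per_vm: int, parallel_containers: int
) -> str:
  # Each container requests an equal integer share of the VM's chips.
  return f'{resource_type}: {int(chips_per_vm / parallel_containers)}'


# e.g. 4 chips per VM split across 2 containers -> 2 chips per container
assert resource_line('google.com/tpu', 4, 2) == 'google.com/tpu: 2'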
xpk/core/pathways.py CHANGED
@@ -245,18 +245,12 @@ def append_custom_colocated_python_sidecar(args) -> str:
 
 
 def get_user_workload_for_pathways(
-    args,
-    system: SystemCharacteristics,
+    args, system: SystemCharacteristics, parallel_containers: int
 ) -> str:
   """
   Create a user workload container for Pathways.
   Don't create one for Pathways headless mode.
 
-  Args:
-    args: user provided args.
-    system: system characteristics.
-
-
 
   Returns:
     str:
       Pathways server port as a YAML string
@@ -280,7 +274,9 @@ def get_user_workload_for_pathways(
   if args.headless:
     return ''
   else:
-    container, _ = get_user_workload_container(args, system)
+    container, _ = get_user_workload_container(
+        args, system, parallel_containers
+    )
     return user_workload_yaml.format(
         args=args,
         container=container,
xpk/core/scheduling.py CHANGED
@@ -88,6 +88,18 @@ def check_if_workload_can_schedule(
       return WorkloadScheduling.UNAVAILABLE
     return WorkloadScheduling.AVAILABLE
 
+  if cluster_system and _check_super_slicing_availability(
+      workload_system=workload_system, cluster_system=cluster_system
+  ):
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
+    ) and _check_super_slicing_topology(workload_system):
+      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
   if workload_system.device_type in resources_config_map:
     if _check_workload_size_fits(
         args,
@@ -112,18 +124,6 @@ def check_if_workload_can_schedule(
     else:
       return WorkloadScheduling.UNAVAILABLE
 
-  if cluster_system and _check_super_slicing_availability(
-      workload_system=workload_system, cluster_system=cluster_system
-  ):
-    if _check_workload_size_fits(
-        args,
-        workload_system,
-        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-    ) and _check_super_slicing_topology(workload_system):
-      return WorkloadScheduling.SUPER_SLICING_AVAILABLE
-    else:
-      return WorkloadScheduling.UNAVAILABLE
-
   xpk_print(
       'Workload scheduling validation failed. XPK will not create the workload'
       f' {args.workload}.'
xpk/core/scheduling_test.py CHANGED
@@ -398,15 +398,23 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         WorkloadScheduling.UNAVAILABLE,
     ),
     (
-        (
-            'Super-slicing should be ignored when a given device is already'
-            ' present in the cluster'
+        'Super-slicing, but one cube',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
         ),
+        WorkloadScheduling.SUPER_SLICING_AVAILABLE,
+    ),
+    (
+        'Super-slicing, but one cube and no super-slicing-topology',
         dataclasses.replace(
             SUPER_SLICING_CASE,
-            workload_system=_get_system_characteristics_or_die('tpu7x-64'),
-            cluster_system=_get_system_characteristics_or_die('tpu7x-64'),
-            resources_config_map={'tpu7x-64': '16'},
+            workload_system=_get_system_characteristics_or_die('tpu7x-128'),
+            cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
+            resources_config_map={'tpu7x-128': '16'},
+            super_slicing_topology_set=False,
        ),
        WorkloadScheduling.AVAILABLE,
    ),
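The scheduling.py change is a pure reordering: the super-slicing branch now runs before the exact device-type match, so a workload whose device type already appears in the cluster's resources config map is still routed to super-slicing first; without a super-slicing topology it falls through to the plain match, which the two new test cases pin down for a single-cube tpu7x-128 workload. A condensed sketch of the resulting precedence (boolean predicates stand in for the private _check_* helpers, and results are shown as strings rather than WorkloadScheduling members):

def schedule(
    super_slicing_ok: bool, device_in_cluster: bool, fits: bool
) -> str:
  # After the reorder, super-slicing is evaluated before the plain
  # device-type match, so it wins even when the device type is
  # already present in the cluster.
  if super_slicing_ok:
    return 'SUPER_SLICING_AVAILABLE' if fits else 'UNAVAILABLE'
  if device_in_cluster:
    return 'AVAILABLE' if fits else 'UNAVAILABLE'
  return 'UNAVAILABLE'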
xpk-1.1.0.dist-info/METADATA → xpk-1.1.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xpk
-Version: 1.1.0
+Version: 1.1.2
 Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
 Author-email: XPK team <xpk-code-reviewers@google.com>
 License: Apache-2.0
xpk-1.1.0.dist-info/RECORD → xpk-1.1.2.dist-info/RECORD CHANGED
@@ -27,7 +27,7 @@ xpk/commands/managed_ml_diagnostics.py,sha256=87wmFbnYQY-kEpJfPo1Up53xM5P_P5wOlX
 xpk/commands/managed_ml_diagnostics_test.py,sha256=pQ1YUGMGRQFJYTS_1o9YyGUzYdLaBdA84LjbnncaeEo,3828
 xpk/commands/storage.py,sha256=cSTJN9Mjvdsvk_Nk43kVdQFhp89nxWbanDsTOGZCkpQ,10708
 xpk/commands/version.py,sha256=k30rdLP9clUM8eeSwRFhpfzSb1qwcQImTfuC59Ed6CA,771
-xpk/commands/workload.py,sha256=gDIzul8myTHG5J45LRjeIC-iSeNJ9ATE1j3DJyt4k4A,32172
+xpk/commands/workload.py,sha256=Xhu_xNzGnKVfU3Piwf-rJbNO0r0LCjwslYjYlvOjD8Y,32347
 xpk/commands/workload_test.py,sha256=m79x6YDYn-36BX0CttTtAMdt_O-WJY40FLTGa6KwKg8,9804
 xpk/core/__init__.py,sha256=YPwWBbgLAu7L-YlTVGB2r8ZV4TzypURMRBcehSHHlLY,561
 xpk/core/capacity.py,sha256=MGiNOwBCwg8Ci-hsssbZYIJ2xXTm6Y5yKTO4J5ozqEk,11053
@@ -38,10 +38,10 @@ xpk/core/cluster_test.py,sha256=VeC1C7kN0OJe6yeoL8GCaFk4uPhijP6CjvQAcE7q9xw,6653
 xpk/core/commands.py,sha256=at73VJHdZ4rVA8uvW997tNrvnCjP9v6zaw96bU0kd74,10841
 xpk/core/config.py,sha256=U2JDXx-XBuqQpZJf2iUDoww5--E8ejZfgmIxKeGu-gU,4668
 xpk/core/config_test.py,sha256=POSuofK0LFbNNygDAo2fjtKY4NMrRjUFeGcpBh9JOS4,3569
-xpk/core/docker_container.py,sha256=Lsn6eJNN6dxvd7IbD0Ew4NnPKYM3VQyB8ursdG4jrIc,7919
+xpk/core/docker_container.py,sha256=9kJpTEholW_d_GamjcqunCWT4XwrDyZs3fcvcPNCb8Y,8294
 xpk/core/docker_image.py,sha256=9vwqbb6Mc3C5ZEOph03WS-EWI5hxMYGGigqzIMkDTjE,6909
 xpk/core/docker_manager.py,sha256=vGPCWPDB507sxEsXvSD4IM-h5HqQzYLk7WSdCUmSDb4,10568
-xpk/core/docker_resources.py,sha256=7EXV1CvwCVogE5-m6utSE1GXxwf6EpB4QDYeuGXWHmI,12547
+xpk/core/docker_resources.py,sha256=bwHGNh_gOtprVOeoFC8NObgKGD9aDjNc2XBMS6syD2Q,12562
 xpk/core/filestore.py,sha256=mcuUzsAPARbnrBG4fIGsEoN8NmzjaQ6k0tvIwMtjO9k,8068
 xpk/core/gcloud_context.py,sha256=d1wQ76zp7QMdG5BxB3sJz4b4OF5Mc8OzmPd_m0xd-Ys,6810
 xpk/core/gcloud_context_test.py,sha256=M8rp6S1zaEcAI7u4Bt8ukWKzv82HH5h9oYVojBcKgHk,5987
@@ -56,12 +56,12 @@ xpk/core/nap.py,sha256=gBxXu8Png1-BlAHbxLWZgbSXeLMGVixufkQVMR0fmvk,12963
 xpk/core/network.py,sha256=Oulb7U69lWkpOKxOC1C7ekJDpC51TLwd7XdZA3NQ7E0,10505
 xpk/core/nodepool.py,sha256=FX2ljKvwMsG3fXfn_CDCRwiKH4UAArQeDiFLq3XK9F0,25495
 xpk/core/nodepool_test.py,sha256=9xSFpn-1j9Vd0J8KFzbq8ywS_Ibsbx4CgR1er68mRnw,17542
-xpk/core/pathways.py,sha256=32GxCIPiEBqSpK6g2gMmB7Nxj_HlG4I30u1C9UyWl1A,11594
+xpk/core/pathways.py,sha256=9w_VrpLLjQSSdNd8HJLWWtIYzA0NpR7t70knRSVLK0w,11574
 xpk/core/pathways_test.py,sha256=UeuSo_g9BNI27to-wflQwc6dJFVSA5-kOK_cjmY5qgU,1809
 xpk/core/ray.py,sha256=JWhc_ToRHpF4_URGnuE_47FMgamaRsA4KVUMpqThWzw,6145
 xpk/core/resources.py,sha256=dDsG_LOtcU17p1UKgOYyjdPxbMfqcb7pJ4SjfLDA6Os,9389
-xpk/core/scheduling.py,sha256=UWEN7cstbvc_9EfSTD1efZD59L5oh7riwNs9TLbvx00,12542
-xpk/core/scheduling_test.py,sha256=0QNiucR77tl72s5FOsp_8RKRp9CjjXSrrhAkTX9kMTg,15883
+xpk/core/scheduling.py,sha256=J0yTpb4jBTQTFJ5QPyycFPFAKXC0fnmxeXRxZbvx8k8,12542
+xpk/core/scheduling_test.py,sha256=zoGLoxNYLQGeQKtWOhBPP0bj4B0zXylRKhRIjO-TyTc,16280
 xpk/core/storage.py,sha256=NILvVAcLNMLmp4wKx_TEKbMMF5X1oL-FrQV46PT0_ds,16902
 xpk/core/system_characteristics.py,sha256=8WXi48mZ7eT9r57FZ5eFtmdonik7MItGTYiuYvcjXG8,34335
 xpk/core/system_characteristics_test.py,sha256=XVaKJ5wYdNwwwUKBnuK3zd1u-Qj3VnJR7MHlOeCa-K0,8029
@@ -142,9 +142,9 @@ xpk/utils/validation.py,sha256=rE9LTkXJT7jIesodFb9pONL7ixhLqiQleyoaz7N39Dw,2765
 xpk/utils/validation_test.py,sha256=PEDSMUqZdt_Lx1FSR-LOTXKKtsJ47JH1fxugM0Gfz6Y,1168
 xpk/utils/versions.py,sha256=_Ep68W70a9605XjiaOOpBa9Is9jXlsoOiwL8v5Xt-WA,897
 xpk/utils/yaml.py,sha256=j8xuAJ9yAAwnQi6ozwZ-nMnDyDnc3xWkeBZMtSuP4RU,844
-xpk-1.1.0.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-xpk-1.1.0.dist-info/METADATA,sha256=b9BX5o6QPikxeZlBzNsCRNSVUpQm3jQs6KSuYhyz88o,10013
-xpk-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-xpk-1.1.0.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
-xpk-1.1.0.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
-xpk-1.1.0.dist-info/RECORD,,
+xpk-1.1.2.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+xpk-1.1.2.dist-info/METADATA,sha256=qKqUDQuylrwPZI7NNzHvJWj6kJE08pZ3SbxHNGZ9qgI,10013
+xpk-1.1.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+xpk-1.1.2.dist-info/entry_points.txt,sha256=mzEtiIesFkT1kmcTUVDA1o3uOhiniX6tIz2wmOlMu1M,38
+xpk-1.1.2.dist-info/top_level.txt,sha256=aDe4N0jicmuWExx_6w0TxWQJaEuPSs9BnLU-3aF1GLo,4
+xpk-1.1.2.dist-info/RECORD,,
xpk-1.1.0.dist-info/WHEEL → xpk-1.1.2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 