xpk-1.0.0-py3-none-any.whl → xpk-1.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. xpk/commands/cluster.py +29 -30
  2. xpk/commands/cluster_gcluster.py +19 -14
  3. xpk/commands/cluster_test.py +1 -21
  4. xpk/commands/common.py +39 -6
  5. xpk/commands/common_test.py +170 -0
  6. xpk/commands/info.py +9 -5
  7. xpk/commands/inspector.py +33 -4
  8. xpk/commands/inspector_test.py +142 -0
  9. xpk/commands/workload.py +35 -17
  10. xpk/commands/workload_test.py +70 -3
  11. xpk/core/blueprint/blueprint_generator.py +19 -8
  12. xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
  13. xpk/core/blueprint/testing/data/a4.yaml +3 -1
  14. xpk/core/capacity.py +37 -17
  15. xpk/core/capacity_test.py +66 -1
  16. xpk/core/cluster.py +10 -10
  17. xpk/core/cluster_private.py +3 -3
  18. xpk/core/cluster_test.py +29 -2
  19. xpk/core/docker_container.py +55 -30
  20. xpk/core/docker_manager.py +4 -4
  21. xpk/core/docker_resources.py +4 -1
  22. xpk/core/kueue_manager.py +6 -8
  23. xpk/core/kueue_manager_test.py +4 -5
  24. xpk/core/nap.py +14 -3
  25. xpk/core/nodepool.py +46 -13
  26. xpk/core/nodepool_test.py +143 -8
  27. xpk/core/pathways.py +4 -8
  28. xpk/core/remote_state/fuse_remote_state.py +1 -1
  29. xpk/core/scheduling.py +16 -13
  30. xpk/core/scheduling_test.py +15 -7
  31. xpk/core/system_characteristics.py +6 -0
  32. xpk/core/telemetry.py +11 -1
  33. xpk/core/telemetry_test.py +39 -0
  34. xpk/core/testing/commands_tester.py +26 -0
  35. xpk/core/testing/commands_tester_test.py +20 -1
  36. xpk/core/workload_decorators/rdma_decorator.py +9 -0
  37. xpk/parser/cluster.py +11 -1
  38. xpk/parser/cluster_test.py +59 -1
  39. xpk/parser/common.py +11 -0
  40. xpk/parser/storage.py +3 -3
  41. xpk/utils/console.py +1 -1
  42. xpk/utils/feature_flags.py +7 -3
  43. {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/METADATA +37 -21
  44. {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/RECORD +48 -55
  45. xpk-1.1.1.dist-info/top_level.txt +1 -0
  46. integration/README.md +0 -19
  47. integration/__init__.py +0 -15
  48. integration/docker_manager_test.py +0 -102
  49. integration/gcluster_a3mega_test.py +0 -215
  50. integration/gcluster_a3ultra_test.py +0 -187
  51. integration/gcluster_a4_test.py +0 -187
  52. integration/gcluster_test.py +0 -107
  53. xpk/utils/user_input.py +0 -48
  54. xpk/utils/user_input_test.py +0 -92
  55. xpk-1.0.0.dist-info/top_level.txt +0 -2
  56. {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/WHEEL +0 -0
  57. {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/entry_points.txt +0 -0
  58. {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/nodepool_test.py CHANGED
@@ -20,6 +20,7 @@ from xpk.core.nodepool import (
20
20
  ensure_resource_policy_exists,
21
21
  get_desired_node_pool_names,
22
22
  run_gke_node_pool_create_command,
23
+ _validate_reservation_count,
23
24
  )
24
25
  from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
25
26
  from xpk.core.commands import FailedCommand
@@ -103,6 +104,7 @@ def commands_tester(mocker):
103
104
  return CommandsTester(
104
105
  mocker,
105
106
  run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
107
+ run_command_batch_path="xpk.core.commands.run_command_batch",
106
108
  )
107
109
 
108
110
 
@@ -119,7 +121,7 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p
119
121
 
120
122
  assert len(commands_tester.commands_history) == 1
121
123
  commands_tester.assert_command_run(
122
- "gcloud compute resource-policies describe resource-policy",
124
+ "gcloud beta compute resource-policies describe resource-policy",
123
125
  "--project=test-project",
124
126
  "--region=us-central1",
125
127
  )
@@ -129,7 +131,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
129
131
  commands_tester: CommandsTester,
130
132
  ):
131
133
  commands_tester.set_result_for_command(
132
- (1, ""), "gcloud compute resource-policies describe"
134
+ (1, ""), "gcloud beta compute resource-policies describe"
133
135
  )
134
136
 
135
137
  ensure_resource_policy_exists(
@@ -142,16 +144,17 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
142
144
 
143
145
  assert len(commands_tester.commands_history) == 2
144
146
  commands_tester.assert_command_run(
145
- "gcloud compute resource-policies describe"
147
+ "gcloud beta compute resource-policies describe"
146
148
  )
147
149
  commands_tester.assert_command_run(
148
- "gcloud compute resource-policies create workload-policy resource-policy",
150
+ "gcloud beta compute resource-policies create workload-policy"
151
+ " resource-policy",
149
152
  "--project=test-project",
150
153
  "--region=us-central1",
151
154
  "--accelerator-topology=2x2x1",
152
155
  )
153
156
  commands_tester.assert_command_not_run(
154
- "gcloud compute resource-policies create workload-policy",
157
+ "gcloud beta compute resource-policies create workload-policy",
155
158
  "--accelerator-topology-mode",
156
159
  )
157
160
 
@@ -160,7 +163,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
160
163
  commands_tester: CommandsTester,
161
164
  ):
162
165
  commands_tester.set_result_for_command(
163
- (1, ""), "gcloud compute resource-policies describe"
166
+ (1, ""), "gcloud beta compute resource-policies describe"
164
167
  )
165
168
 
166
169
  ensure_resource_policy_exists(
@@ -172,7 +175,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
172
175
  )
173
176
 
174
177
  commands_tester.assert_command_run(
175
- "gcloud compute resource-policies create workload-policy",
178
+ "gcloud beta compute resource-policies create workload-policy",
176
179
  "--accelerator-topology-mode",
177
180
  )
178
181
 
@@ -182,7 +185,7 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati
182
185
  ):
183
186
  with pytest.raises(RuntimeError):
184
187
  commands_tester.set_result_for_command(
185
- (1, ""), "gcloud compute resource-policies"
188
+ (1, ""), "gcloud beta compute resource-policies"
186
189
  )
187
190
 
188
191
  ensure_resource_policy_exists(
@@ -433,3 +436,135 @@ def test_display_nodepool_creation_ignores_logs_without_errors(
433
436
  mock_xpk_print.call_args_list[0].args[0]
434
437
  == "Create Nodepools returned ERROR 1"
435
438
  )
439
+
440
+
441
+ def test_validate_reservation_count_mismatch(mock_xpk_print):
442
+ result = _validate_reservation_count(
443
+ ["res1", "res2"], num_node_pools_to_create=3
444
+ )
445
+
446
+ assert result == 1
447
+ assert mock_xpk_print.call_count == 1
448
+ assert (
449
+ "reservations (2) must match the number of NEW nodepools (3)"
450
+ in mock_xpk_print.call_args_list[0].args[0]
451
+ )
452
+
453
+
454
+ def test_run_gke_node_pool_create_command_multiple_reservations(
455
+ mocker,
456
+ commands_tester: CommandsTester,
457
+ ):
458
+ mocker.patch(
459
+ "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
460
+ )
461
+ mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
462
+ args = mocker.Mock(
463
+ num_slices=2,
464
+ reservation="res1,res2",
465
+ tpu_type="v4-8",
466
+ device_type=None,
467
+ cluster="test-cluster",
468
+ project="test-project",
469
+ zone="us-central1-a",
470
+ on_demand=False,
471
+ spot=False,
472
+ flex=False,
473
+ enable_workload_identity=False,
474
+ enable_gcsfuse_csi_driver=False,
475
+ host_maintenance_interval="AS_NEEDED",
476
+ custom_nodepool_arguments="",
477
+ )
478
+ system = SystemCharacteristics(
479
+ topology="2x2x1",
480
+ vms_per_slice=2,
481
+ gke_accelerator="tpu-v4",
482
+ gce_machine_type="ct4p-hightpu-4t",
483
+ chips_per_vm=4,
484
+ accelerator_type=AcceleratorType.TPU,
485
+ device_type="v4-8",
486
+ requires_workload_policy=False,
487
+ supports_sub_slicing=False,
488
+ supports_super_slicing=False,
489
+ supports_accelerator_network_profile=True,
490
+ docker_platform=DockerPlatform.AMD,
491
+ )
492
+ commands_tester.set_result_for_command(
493
+ (0, ""), "gcloud beta container node-pools list"
494
+ )
495
+
496
+ result = run_gke_node_pool_create_command(args, system, "1.2.3")
497
+
498
+ assert result == 0
499
+ commands_tester.assert_command_run(
500
+ "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
501
+ )
502
+ commands_tester.assert_command_run(
503
+ "gcloud", "node-pools create", "test-cluster-np-0", "--reservation=res1"
504
+ )
505
+ commands_tester.assert_command_run(
506
+ "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res2"
507
+ )
508
+
509
+
510
+ def test_run_gke_node_pool_create_command_partial_reservations(
511
+ mocker,
512
+ commands_tester: CommandsTester,
513
+ ):
514
+ mocker.patch(
515
+ "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
516
+ )
517
+ mocker.patch("xpk.core.nodepool.get_node_pools_to_delete", return_value=[])
518
+ mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
519
+ args = mocker.Mock(
520
+ num_slices=3,
521
+ reservation="res1,res2",
522
+ tpu_type="v4-8",
523
+ device_type=None,
524
+ cluster="test-cluster",
525
+ project="test-project",
526
+ zone="us-central1-a",
527
+ on_demand=False,
528
+ spot=False,
529
+ flex=False,
530
+ enable_workload_identity=False,
531
+ enable_gcsfuse_csi_driver=False,
532
+ host_maintenance_interval="AS_NEEDED",
533
+ custom_nodepool_arguments="",
534
+ )
535
+ system = SystemCharacteristics(
536
+ topology="2x2x1",
537
+ vms_per_slice=2,
538
+ gke_accelerator="tpu-v4",
539
+ gce_machine_type="ct4p-hightpu-4t",
540
+ chips_per_vm=4,
541
+ accelerator_type=AcceleratorType.TPU,
542
+ device_type="v4-8",
543
+ requires_workload_policy=False,
544
+ supports_sub_slicing=False,
545
+ supports_super_slicing=False,
546
+ supports_accelerator_network_profile=True,
547
+ docker_platform=DockerPlatform.AMD,
548
+ )
549
+ commands_tester.set_result_for_command(
550
+ (0, "test-cluster-np-0"), "gcloud beta container node-pools list"
551
+ )
552
+ commands_tester.set_result_for_command(
553
+ (0, "us-central1-a"),
554
+ "gcloud",
555
+ "node-pools describe",
556
+ '--format="value(locations)"',
557
+ )
558
+
559
+ result = run_gke_node_pool_create_command(args, system, "1.2.3")
560
+
561
+ assert result == 0
562
+ commands_tester.assert_command_run(
563
+ "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
564
+ )
565
+ commands_tester.assert_command_run(
566
+ "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res1"
567
+ )
568
+ commands_tester.assert_command_run(
569
+ "gcloud", "node-pools create", "test-cluster-np-2", "--reservation=res2"
570
+ )
xpk/core/pathways.py CHANGED
@@ -245,18 +245,12 @@ def append_custom_colocated_python_sidecar(args) -> str:
245
245
 
246
246
 
247
247
  def get_user_workload_for_pathways(
248
- args,
249
- system: SystemCharacteristics,
248
+ args, system: SystemCharacteristics, parallel_containers: int
250
249
  ) -> str:
251
250
  """
252
251
  Create a user workload container for Pathways.
253
252
  Don't create one for Pathways headless mode.
254
253
 
255
- Args:
256
- args: user provided args.
257
- system: system characteristics.
258
-
259
-
260
254
  Returns:
261
255
  str:
262
256
  Pathways server port as a YAML string
@@ -280,7 +274,9 @@ def get_user_workload_for_pathways(
280
274
  if args.headless:
281
275
  return ''
282
276
  else:
283
- container, _ = get_user_workload_container(args, system)
277
+ container, _ = get_user_workload_container(
278
+ args, system, parallel_containers
279
+ )
284
280
  return user_workload_yaml.format(
285
281
  args=args,
286
282
  container=container,
@@ -56,7 +56,7 @@ class FuseStateClient(RemoteStateClient):
56
56
 
57
57
  def upload_state(self) -> None:
58
58
  xpk_print(
59
- f'Uploading dependecies from directory {self.state_dir} to bucket:'
59
+ f'Uploading dependencies from directory {self.state_dir} to bucket:'
60
60
  f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
61
61
  )
62
62
  upload_directory_to_gcs(
xpk/core/scheduling.py CHANGED
@@ -33,8 +33,11 @@ from .system_characteristics import (
33
33
  from packaging.version import Version
34
34
 
35
35
  _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
36
- _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.14.0')
36
+ _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
37
37
  _SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
38
+ ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
39
+ 'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
40
+ )
38
41
 
39
42
 
40
43
  class WorkloadScheduling(Enum):
@@ -85,6 +88,18 @@ def check_if_workload_can_schedule(
85
88
  return WorkloadScheduling.UNAVAILABLE
86
89
  return WorkloadScheduling.AVAILABLE
87
90
 
91
+ if cluster_system and _check_super_slicing_availability(
92
+ workload_system=workload_system, cluster_system=cluster_system
93
+ ):
94
+ if _check_workload_size_fits(
95
+ args,
96
+ workload_system,
97
+ max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
98
+ ) and _check_super_slicing_topology(workload_system):
99
+ return WorkloadScheduling.SUPER_SLICING_AVAILABLE
100
+ else:
101
+ return WorkloadScheduling.UNAVAILABLE
102
+
88
103
  if workload_system.device_type in resources_config_map:
89
104
  if _check_workload_size_fits(
90
105
  args,
@@ -109,18 +124,6 @@ def check_if_workload_can_schedule(
109
124
  else:
110
125
  return WorkloadScheduling.UNAVAILABLE
111
126
 
112
- if cluster_system and _check_super_slicing_availability(
113
- workload_system=workload_system, cluster_system=cluster_system
114
- ):
115
- if _check_workload_size_fits(
116
- args,
117
- workload_system,
118
- max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
119
- ) and _check_super_slicing_topology(workload_system):
120
- return WorkloadScheduling.SUPER_SLICING_AVAILABLE
121
- else:
122
- return WorkloadScheduling.UNAVAILABLE
123
-
124
127
  xpk_print(
125
128
  'Workload scheduling validation failed. XPK will not create the workload'
126
129
  f' {args.workload}.'
@@ -208,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
208
208
  cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
209
209
  # 5 4x4x4 cubes:
210
210
  resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
211
- kueue_version='0.14.0',
211
+ kueue_version='0.15.2',
212
212
  super_slicing_feature_enabled=True,
213
213
  super_slicing_topology_set=True,
214
214
  num_slices=1,
@@ -398,15 +398,23 @@ SUPER_SLICING_CASE = SchedulingTestCase(
398
398
  WorkloadScheduling.UNAVAILABLE,
399
399
  ),
400
400
  (
401
- (
402
- 'Super-slicing should be ignored when a given device is already'
403
- ' present in the cluster'
401
+ 'Super-slicing, but one cube',
402
+ dataclasses.replace(
403
+ SUPER_SLICING_CASE,
404
+ workload_system=_get_system_characteristics_or_die('tpu7x-128'),
405
+ cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
406
+ resources_config_map={'tpu7x-128': '16'},
404
407
  ),
408
+ WorkloadScheduling.SUPER_SLICING_AVAILABLE,
409
+ ),
410
+ (
411
+ 'Super-slicing, but one cube and no super-slicing-topology',
405
412
  dataclasses.replace(
406
413
  SUPER_SLICING_CASE,
407
- workload_system=_get_system_characteristics_or_die('tpu7x-64'),
408
- cluster_system=_get_system_characteristics_or_die('tpu7x-64'),
409
- resources_config_map={'tpu7x-64': '16'},
414
+ workload_system=_get_system_characteristics_or_die('tpu7x-128'),
415
+ cluster_system=_get_system_characteristics_or_die('tpu7x-128'),
416
+ resources_config_map={'tpu7x-128': '16'},
417
+ super_slicing_topology_set=False,
410
418
  ),
411
419
  WorkloadScheduling.AVAILABLE,
412
420
  ),
@@ -131,6 +131,8 @@ class SystemCharacteristics:
131
131
  supports_super_slicing: Whether the Super-slicing feature is supported.
132
132
  requires_workload_policy: A boolean indicating if a GCE resource
133
133
  workload policy is required. This is automatically set to True for GPUs.
134
+ parallel_containers: The number of containers running on a single VM.
135
+
134
136
  """
135
137
 
136
138
  topology: str
@@ -146,6 +148,7 @@ class SystemCharacteristics:
146
148
  docker_platform: DockerPlatform
147
149
  requires_workload_policy: bool = False
148
150
  gpu_config: Optional[GpuConfig] = None
151
+ parallel_containers: int = 1
149
152
 
150
153
  def __post_init__(self):
151
154
  if self.accelerator_type == AcceleratorType.GPU:
@@ -239,6 +242,7 @@ def get_tpu_system_characteristics_map(
239
242
  default_topologies: set[str] | None = None,
240
243
  sub_slicing_topologies: set[str] | None = None,
241
244
  super_slicing_topologies: set[str] | None = None,
245
+ parallel_containers: int = 1,
242
246
  ) -> dict[str, SystemCharacteristics]:
243
247
  system_characteristics_map = {}
244
248
  default_topologies = default_topologies or set()
@@ -263,6 +267,7 @@ def get_tpu_system_characteristics_map(
263
267
  supports_super_slicing=topology in super_slicing_topologies,
264
268
  supports_accelerator_network_profile=supports_accelerator_network_profile,
265
269
  docker_platform=docker_platform,
270
+ parallel_containers=parallel_containers,
266
271
  )
267
272
  system_characteristics_map[f'{prefix}-{topology}'] = system
268
273
  if (
@@ -544,6 +549,7 @@ UserFacingNameToSystemCharacteristics = {
544
549
  tpu_type_requires_workload_policy=True,
545
550
  supports_accelerator_network_profile=False,
546
551
  docker_platform=AMD_PLATFORM,
552
+ parallel_containers=2,
547
553
  supported_topologies=generate_tpu_topologies(max_cubes=144),
548
554
  super_slicing_topologies=set(['4x4x4']),
549
555
  default_topologies=set([
xpk/core/telemetry.py CHANGED
@@ -30,7 +30,7 @@ from dataclasses import dataclass
30
30
  from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
31
31
  from ..utils.execution_context import is_dry_run
32
32
  from ..utils.user_agent import get_user_agent
33
- from ..utils.feature_flags import FeatureFlags
33
+ from ..utils.feature_flags import FeatureFlags, is_tester
34
34
 
35
35
 
36
36
  def should_send_telemetry():
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:
114
114
 
115
115
 
116
116
  class MetricsEventMetadataKey(Enum):
117
+ """Represents available metadata keys."""
118
+
117
119
  SESSION_ID = "XPK_SESSION_ID"
118
120
  DRY_RUN = "XPK_DRY_RUN"
119
121
  PYTHON_VERSION = "XPK_PYTHON_VERSION"
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
125
127
  RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
126
128
  RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
127
129
  LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
130
+ TESTER = "XPK_TESTER"
128
131
 
129
132
 
130
133
  @dataclass
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
230
233
  MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
231
234
  _is_running_from_source()
232
235
  ).lower(),
236
+ MetricsEventMetadataKey.TESTER: str(
237
+ is_tester() or _is_trash_execution()
238
+ ).lower(),
233
239
  }
234
240
 
235
241
 
@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
241
247
  }
242
248
 
243
249
 
250
+ def _is_trash_execution() -> bool:
251
+ return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"
252
+
253
+
244
254
  def _is_running_as_pip() -> bool:
245
255
  return os.path.basename(sys.argv[0]) == "xpk"
246
256
 
@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
30
30
  mocker.patch('time.time', side_effect=itertools.count())
31
31
  mocker.patch('platform.python_version', return_value='99.99.99')
32
32
  mocker.patch('os.path.basename', return_value='xpk.py')
33
+ mocker.patch('os.getenv', return_value='false')
33
34
  mocker.patch('os.path.abspath', return_value='/home/xpk_user')
35
+ mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
34
36
  set_dry_run(False)
35
37
  get_config().set(CLIENT_ID_KEY, 'client_id')
36
38
  yield
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
76
78
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
77
79
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
78
80
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
81
+ {'key': 'XPK_TESTER', 'value': 'false'},
79
82
  {'key': 'XPK_COMMAND', 'value': 'test'},
80
83
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
81
84
  ],
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
107
110
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
108
111
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
109
112
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
113
+ {'key': 'XPK_TESTER', 'value': 'false'},
110
114
  {'key': 'XPK_EXIT_CODE', 'value': '2'},
111
115
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
112
116
  ],
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
131
135
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
132
136
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
133
137
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
138
+ {'key': 'XPK_TESTER', 'value': 'false'},
134
139
  {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
135
140
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
136
141
  ],
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
219
224
  assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
220
225
 
221
226
 
227
+ @pytest.mark.parametrize(
228
+ argnames='tester,expected',
229
+ argvalues=[
230
+ (True, 'true'),
231
+ (False, 'false'),
232
+ ],
233
+ )
234
+ def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
235
+ tester: bool, expected: str, mocker: MockerFixture
236
+ ):
237
+ mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
238
+ MetricsCollector.log_start(command='test')
239
+ payload = MetricsCollector.flush()
240
+ assert _get_metadata_value(payload, 'XPK_TESTER') == expected
241
+
242
+
243
+ @pytest.mark.parametrize(
244
+ argnames='trash_execution,expected',
245
+ argvalues=[
246
+ ('true', 'true'),
247
+ ('false', 'false'),
248
+ ('', 'false'),
249
+ (None, 'false'),
250
+ ],
251
+ )
252
+ def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
253
+ trash_execution: str, expected: str, mocker: MockerFixture
254
+ ):
255
+ mocker.patch('os.getenv', return_value=trash_execution)
256
+ MetricsCollector.log_start(command='test')
257
+ payload = MetricsCollector.flush()
258
+ assert _get_metadata_value(payload, 'XPK_TESTER') == expected
259
+
260
+
222
261
  def _get_metadata_value(payload_str: str, key: str) -> str | None:
223
262
  payload = json.loads(payload_str)
224
263
  metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
@@ -17,6 +17,8 @@ limitations under the License.
17
17
  import re
18
18
  from pytest_mock import MockerFixture
19
19
 
20
+ from ..commands import FailedCommand
21
+
20
22
 
21
23
  class CommandsTester:
22
24
  """Tester class useful for mocking and asserting command runs."""
@@ -27,6 +29,7 @@ class CommandsTester:
27
29
  run_command_for_value_path: str | None = None,
28
30
  run_command_with_updates_path: str | None = None,
29
31
  run_command_with_updates_retry_path: str | None = None,
32
+ run_command_batch_path: str | None = None,
30
33
  ):
31
34
  self.__results: dict[re.Pattern, tuple[int, str]] = {}
32
35
  self.commands_history: list[str] = []
@@ -45,6 +48,11 @@ class CommandsTester:
45
48
  run_command_with_updates_retry_path,
46
49
  wraps=self.__fake_run_command_with_updates_retry,
47
50
  )
51
+ if run_command_batch_path:
52
+ mocker.patch(
53
+ run_command_batch_path,
54
+ wraps=self.__fake_run_command_batch,
55
+ )
48
56
 
49
57
  def set_result_for_command(
50
58
  self, result: tuple[int, str], *command_parts: str
@@ -111,6 +119,24 @@ class CommandsTester:
111
119
  ) -> tuple[int, str]:
112
120
  return self.__common_fake_run_command(command, (0, dry_run_return_val))
113
121
 
122
+ def __fake_run_command_batch(
123
+ self,
124
+ commands: list[str],
125
+ jobname: str,
126
+ per_command_name: list[str],
127
+ output_logs: list[str],
128
+ ) -> FailedCommand | None:
129
+ for i, command in enumerate(commands):
130
+ result = self.__common_fake_run_command(command, (0, ""))[0]
131
+ if result != 0:
132
+ return FailedCommand(
133
+ return_code=result,
134
+ name=per_command_name[i],
135
+ command=command,
136
+ logfile=output_logs[i],
137
+ )
138
+ return None
139
+
114
140
  # pylint: enable=unused-argument
115
141
 
116
142
  def __common_fake_run_command(
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  import pytest
18
18
  from pytest_mock import MockerFixture
19
19
 
20
- from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
20
+ from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
21
21
  from xpk.core.testing.commands_tester import CommandsTester
22
22
 
23
23
 
@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
31
31
  run_command_with_updates_retry_path=(
32
32
  "xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
33
33
  ),
34
+ run_command_batch_path=(
35
+ "xpk.core.testing.commands_tester_test.run_command_batch"
36
+ ),
34
37
  )
35
38
 
36
39
 
@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
54
57
  mock_commands.assert_command_run("cmd", "bar")
55
58
 
56
59
 
60
+ def test_run_command_batch_default_result(
61
+ mock_commands: CommandsTester,
62
+ ):
63
+ result = run_command_batch(
64
+ commands=["cmd1 foo bar", "cmd2 foo bar"],
65
+ jobname="Test command",
66
+ per_command_name=["cmd1", "cmd2"],
67
+ output_logs=["log1", "log2"],
68
+ )
69
+
70
+ assert result is None
71
+ mock_commands.assert_command_run("foo bar", times=2)
72
+ mock_commands.assert_command_run("cmd1")
73
+ mock_commands.assert_command_run("cmd2")
74
+
75
+
57
76
  def test_set_result_for_command(mock_commands: CommandsTester):
58
77
  mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")
59
78
 
@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
84
84
  volumes.append(
85
85
  {'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
86
86
  )
87
+ volumes.append({
88
+ 'name': 'dshm',
89
+ 'emptyDir': {
90
+ 'medium': 'Memory',
91
+ },
92
+ })
87
93
 
88
94
 
89
95
  def add_tolerations(job_manifest):
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
111
117
  container['volumeMounts'].append(
112
118
  {'name': 'gib', 'mountPath': '/usr/local/gib'}
113
119
  )
120
+ container['volumeMounts'].append(
121
+ {'name': 'dshm', 'mountPath': '/dev/shm'}
122
+ )
xpk/parser/cluster.py CHANGED
@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
338
338
  add_resource_limits(cluster_create_resource_limits)
339
339
 
340
340
  cluster_create_ray_parser.set_defaults(
341
- func=cluster_create_ray_cluster, sub_slicing=False, super_slicing=False
341
+ func=cluster_create_ray_cluster,
342
+ sub_slicing=False,
343
+ super_slicing=False,
344
+ num_cubes=None,
342
345
  )
343
346
 
344
347
 
@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
503
506
  )
504
507
  add_driver_arguments(cluster_adapt_optional_arguments)
505
508
  add_shared_arguments(cluster_adapt_optional_arguments)
509
+ add_resource_limits(cluster_adapt_optional_arguments)
510
+
511
+ if FeatureFlags.SUB_SLICING_ENABLED:
512
+ add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
513
+
514
+ if FeatureFlags.SUPER_SLICING_ENABLED:
515
+ add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)
506
516
 
507
517
  cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
508
518
  'Capacity Arguments', 'Arguments related to capacity for cluster create.'