xpk 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. xpk/commands/cluster.py +29 -30
  2. xpk/commands/cluster_gcluster.py +19 -14
  3. xpk/commands/cluster_test.py +1 -21
  4. xpk/commands/common.py +39 -6
  5. xpk/commands/common_test.py +170 -0
  6. xpk/commands/info.py +9 -5
  7. xpk/commands/inspector.py +33 -4
  8. xpk/commands/inspector_test.py +142 -0
  9. xpk/commands/workload.py +22 -8
  10. xpk/commands/workload_test.py +70 -3
  11. xpk/core/blueprint/blueprint_generator.py +19 -8
  12. xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
  13. xpk/core/blueprint/testing/data/a4.yaml +3 -1
  14. xpk/core/capacity.py +37 -17
  15. xpk/core/capacity_test.py +66 -1
  16. xpk/core/cluster.py +10 -10
  17. xpk/core/cluster_private.py +3 -3
  18. xpk/core/cluster_test.py +29 -2
  19. xpk/core/docker_container.py +31 -24
  20. xpk/core/docker_manager.py +4 -4
  21. xpk/core/docker_resources.py +4 -1
  22. xpk/core/kueue_manager.py +6 -8
  23. xpk/core/kueue_manager_test.py +4 -5
  24. xpk/core/nap.py +14 -3
  25. xpk/core/nodepool.py +46 -13
  26. xpk/core/nodepool_test.py +143 -8
  27. xpk/core/remote_state/fuse_remote_state.py +1 -1
  28. xpk/core/scheduling.py +4 -1
  29. xpk/core/scheduling_test.py +1 -1
  30. xpk/core/system_characteristics.py +6 -0
  31. xpk/core/telemetry.py +11 -1
  32. xpk/core/telemetry_test.py +39 -0
  33. xpk/core/testing/commands_tester.py +26 -0
  34. xpk/core/testing/commands_tester_test.py +20 -1
  35. xpk/core/workload_decorators/rdma_decorator.py +9 -0
  36. xpk/parser/cluster.py +11 -1
  37. xpk/parser/cluster_test.py +59 -1
  38. xpk/parser/common.py +11 -0
  39. xpk/parser/storage.py +3 -3
  40. xpk/utils/console.py +1 -1
  41. xpk/utils/feature_flags.py +7 -3
  42. {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/METADATA +37 -21
  43. {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/RECORD +47 -54
  44. xpk-1.1.0.dist-info/top_level.txt +1 -0
  45. integration/README.md +0 -19
  46. integration/__init__.py +0 -15
  47. integration/docker_manager_test.py +0 -102
  48. integration/gcluster_a3mega_test.py +0 -215
  49. integration/gcluster_a3ultra_test.py +0 -187
  50. integration/gcluster_a4_test.py +0 -187
  51. integration/gcluster_test.py +0 -107
  52. xpk/utils/user_input.py +0 -48
  53. xpk/utils/user_input_test.py +0 -92
  54. xpk-1.0.0.dist-info/top_level.txt +0 -2
  55. {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
  56. {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
  57. {xpk-1.0.0.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
xpk/core/scheduling.py CHANGED
@@ -33,8 +33,11 @@ from .system_characteristics import (
33
33
  from packaging.version import Version
34
34
 
35
35
  _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
36
- _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.14.0')
36
+ _SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
37
37
  _SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
38
+ ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
39
+ 'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
40
+ )
38
41
 
39
42
 
40
43
  class WorkloadScheduling(Enum):
@@ -208,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
208
208
  cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
209
209
  # 5 4x4x4 cubes:
210
210
  resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
211
- kueue_version='0.14.0',
211
+ kueue_version='0.15.2',
212
212
  super_slicing_feature_enabled=True,
213
213
  super_slicing_topology_set=True,
214
214
  num_slices=1,
@@ -131,6 +131,8 @@ class SystemCharacteristics:
131
131
  supports_super_slicing: Whether the Super-slicing feature is supported.
132
132
  requires_workload_policy: A boolean indicating if a GCE resource
133
133
  workload policy is required. This is automatically set to True for GPUs.
134
+ parallel_containers: The number of containers running on a single VM.
135
+
134
136
  """
135
137
 
136
138
  topology: str
@@ -146,6 +148,7 @@ class SystemCharacteristics:
146
148
  docker_platform: DockerPlatform
147
149
  requires_workload_policy: bool = False
148
150
  gpu_config: Optional[GpuConfig] = None
151
+ parallel_containers: int = 1
149
152
 
150
153
  def __post_init__(self):
151
154
  if self.accelerator_type == AcceleratorType.GPU:
@@ -239,6 +242,7 @@ def get_tpu_system_characteristics_map(
239
242
  default_topologies: set[str] | None = None,
240
243
  sub_slicing_topologies: set[str] | None = None,
241
244
  super_slicing_topologies: set[str] | None = None,
245
+ parallel_containers: int = 1,
242
246
  ) -> dict[str, SystemCharacteristics]:
243
247
  system_characteristics_map = {}
244
248
  default_topologies = default_topologies or set()
@@ -263,6 +267,7 @@ def get_tpu_system_characteristics_map(
263
267
  supports_super_slicing=topology in super_slicing_topologies,
264
268
  supports_accelerator_network_profile=supports_accelerator_network_profile,
265
269
  docker_platform=docker_platform,
270
+ parallel_containers=parallel_containers,
266
271
  )
267
272
  system_characteristics_map[f'{prefix}-{topology}'] = system
268
273
  if (
@@ -544,6 +549,7 @@ UserFacingNameToSystemCharacteristics = {
544
549
  tpu_type_requires_workload_policy=True,
545
550
  supports_accelerator_network_profile=False,
546
551
  docker_platform=AMD_PLATFORM,
552
+ parallel_containers=2,
547
553
  supported_topologies=generate_tpu_topologies(max_cubes=144),
548
554
  super_slicing_topologies=set(['4x4x4']),
549
555
  default_topologies=set([
xpk/core/telemetry.py CHANGED
@@ -30,7 +30,7 @@ from dataclasses import dataclass
30
30
  from .config import get_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
31
31
  from ..utils.execution_context import is_dry_run
32
32
  from ..utils.user_agent import get_user_agent
33
- from ..utils.feature_flags import FeatureFlags
33
+ from ..utils.feature_flags import FeatureFlags, is_tester
34
34
 
35
35
 
36
36
  def should_send_telemetry():
@@ -114,6 +114,8 @@ def _clearcut_flush(file_path: str) -> None:
114
114
 
115
115
 
116
116
  class MetricsEventMetadataKey(Enum):
117
+ """Represents available metadata keys."""
118
+
117
119
  SESSION_ID = "XPK_SESSION_ID"
118
120
  DRY_RUN = "XPK_DRY_RUN"
119
121
  PYTHON_VERSION = "XPK_PYTHON_VERSION"
@@ -125,6 +127,7 @@ class MetricsEventMetadataKey(Enum):
125
127
  RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"
126
128
  RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"
127
129
  LATENCY_SECONDS = "XPK_LATENCY_SECONDS"
130
+ TESTER = "XPK_TESTER"
128
131
 
129
132
 
130
133
  @dataclass
@@ -230,6 +233,9 @@ def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
230
233
  MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
231
234
  _is_running_from_source()
232
235
  ).lower(),
236
+ MetricsEventMetadataKey.TESTER: str(
237
+ is_tester() or _is_trash_execution()
238
+ ).lower(),
233
239
  }
234
240
 
235
241
 
@@ -241,6 +247,10 @@ def _get_base_concord_event() -> dict[str, str]:
241
247
  }
242
248
 
243
249
 
250
+ def _is_trash_execution() -> bool:
251
+ return os.getenv("TELEMETRY_TRASH_EXECUTION") == "true"
252
+
253
+
244
254
  def _is_running_as_pip() -> bool:
245
255
  return os.path.basename(sys.argv[0]) == "xpk"
246
256
 
@@ -30,7 +30,9 @@ def setup_mocks(mocker: MockerFixture):
30
30
  mocker.patch('time.time', side_effect=itertools.count())
31
31
  mocker.patch('platform.python_version', return_value='99.99.99')
32
32
  mocker.patch('os.path.basename', return_value='xpk.py')
33
+ mocker.patch('os.getenv', return_value='false')
33
34
  mocker.patch('os.path.abspath', return_value='/home/xpk_user')
35
+ mocker.patch('xpk.core.telemetry.is_tester', return_value=False)
34
36
  set_dry_run(False)
35
37
  get_config().set(CLIENT_ID_KEY, 'client_id')
36
38
  yield
@@ -76,6 +78,7 @@ def test_metrics_collector_logs_start_event_correctly():
76
78
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
77
79
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
78
80
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
81
+ {'key': 'XPK_TESTER', 'value': 'false'},
79
82
  {'key': 'XPK_COMMAND', 'value': 'test'},
80
83
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
81
84
  ],
@@ -107,6 +110,7 @@ def test_metrics_collector_logs_complete_event_correctly():
107
110
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
108
111
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
109
112
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
113
+ {'key': 'XPK_TESTER', 'value': 'false'},
110
114
  {'key': 'XPK_EXIT_CODE', 'value': '2'},
111
115
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
112
116
  ],
@@ -131,6 +135,7 @@ def test_metrics_collector_logs_custom_event_correctly():
131
135
  {'key': 'XPK_PYTHON_VERSION', 'value': '99.99.99'},
132
136
  {'key': 'XPK_RUNNING_AS_PIP', 'value': 'false'},
133
137
  {'key': 'XPK_RUNNING_FROM_SOURCE', 'value': 'true'},
138
+ {'key': 'XPK_TESTER', 'value': 'false'},
134
139
  {'key': 'XPK_PROVISIONING_MODE', 'value': 'flex'},
135
140
  {'key': 'XPK_LATENCY_SECONDS', 'value': '0'},
136
141
  ],
@@ -219,6 +224,40 @@ def test_metrics_collectors_logs_correct_running_from_source_value(
219
224
  assert _get_metadata_value(payload, 'XPK_RUNNING_FROM_SOURCE') == expected
220
225
 
221
226
 
227
+ @pytest.mark.parametrize(
228
+ argnames='tester,expected',
229
+ argvalues=[
230
+ (True, 'true'),
231
+ (False, 'false'),
232
+ ],
233
+ )
234
+ def test_metrics_collectors_logs_correct_tester_value_for_is_tester_variable(
235
+ tester: bool, expected: str, mocker: MockerFixture
236
+ ):
237
+ mocker.patch('xpk.core.telemetry.is_tester', return_value=tester)
238
+ MetricsCollector.log_start(command='test')
239
+ payload = MetricsCollector.flush()
240
+ assert _get_metadata_value(payload, 'XPK_TESTER') == expected
241
+
242
+
243
+ @pytest.mark.parametrize(
244
+ argnames='trash_execution,expected',
245
+ argvalues=[
246
+ ('true', 'true'),
247
+ ('false', 'false'),
248
+ ('', 'false'),
249
+ (None, 'false'),
250
+ ],
251
+ )
252
+ def test_metrics_collectors_logs_correct_tester_value_for_trash_variable(
253
+ trash_execution: str, expected: str, mocker: MockerFixture
254
+ ):
255
+ mocker.patch('os.getenv', return_value=trash_execution)
256
+ MetricsCollector.log_start(command='test')
257
+ payload = MetricsCollector.flush()
258
+ assert _get_metadata_value(payload, 'XPK_TESTER') == expected
259
+
260
+
222
261
  def _get_metadata_value(payload_str: str, key: str) -> str | None:
223
262
  payload = json.loads(payload_str)
224
263
  metadata = json.loads(payload['log_event'][0]['source_extension_json'])[
@@ -17,6 +17,8 @@ limitations under the License.
17
17
  import re
18
18
  from pytest_mock import MockerFixture
19
19
 
20
+ from ..commands import FailedCommand
21
+
20
22
 
21
23
  class CommandsTester:
22
24
  """Tester class useful for mocking and asserting command runs."""
@@ -27,6 +29,7 @@ class CommandsTester:
27
29
  run_command_for_value_path: str | None = None,
28
30
  run_command_with_updates_path: str | None = None,
29
31
  run_command_with_updates_retry_path: str | None = None,
32
+ run_command_batch_path: str | None = None,
30
33
  ):
31
34
  self.__results: dict[re.Pattern, tuple[int, str]] = {}
32
35
  self.commands_history: list[str] = []
@@ -45,6 +48,11 @@ class CommandsTester:
45
48
  run_command_with_updates_retry_path,
46
49
  wraps=self.__fake_run_command_with_updates_retry,
47
50
  )
51
+ if run_command_batch_path:
52
+ mocker.patch(
53
+ run_command_batch_path,
54
+ wraps=self.__fake_run_command_batch,
55
+ )
48
56
 
49
57
  def set_result_for_command(
50
58
  self, result: tuple[int, str], *command_parts: str
@@ -111,6 +119,24 @@ class CommandsTester:
111
119
  ) -> tuple[int, str]:
112
120
  return self.__common_fake_run_command(command, (0, dry_run_return_val))
113
121
 
122
+ def __fake_run_command_batch(
123
+ self,
124
+ commands: list[str],
125
+ jobname: str,
126
+ per_command_name: list[str],
127
+ output_logs: list[str],
128
+ ) -> FailedCommand | None:
129
+ for i, command in enumerate(commands):
130
+ result = self.__common_fake_run_command(command, (0, ""))[0]
131
+ if result != 0:
132
+ return FailedCommand(
133
+ return_code=result,
134
+ name=per_command_name[i],
135
+ command=command,
136
+ logfile=output_logs[i],
137
+ )
138
+ return None
139
+
114
140
  # pylint: enable=unused-argument
115
141
 
116
142
  def __common_fake_run_command(
@@ -17,7 +17,7 @@ limitations under the License.
17
17
  import pytest
18
18
  from pytest_mock import MockerFixture
19
19
 
20
- from xpk.core.commands import run_command_for_value, run_command_with_updates_retry
20
+ from xpk.core.commands import run_command_for_value, run_command_with_updates_retry, run_command_batch
21
21
  from xpk.core.testing.commands_tester import CommandsTester
22
22
 
23
23
 
@@ -31,6 +31,9 @@ def mock_commands(mocker: MockerFixture) -> CommandsTester:
31
31
  run_command_with_updates_retry_path=(
32
32
  "xpk.core.testing.commands_tester_test.run_command_with_updates_retry"
33
33
  ),
34
+ run_command_batch_path=(
35
+ "xpk.core.testing.commands_tester_test.run_command_batch"
36
+ ),
34
37
  )
35
38
 
36
39
 
@@ -54,6 +57,22 @@ def test_run_command_with_updates_retry_default_result(
54
57
  mock_commands.assert_command_run("cmd", "bar")
55
58
 
56
59
 
60
+ def test_run_command_batch_default_result(
61
+ mock_commands: CommandsTester,
62
+ ):
63
+ result = run_command_batch(
64
+ commands=["cmd1 foo bar", "cmd2 foo bar"],
65
+ jobname="Test command",
66
+ per_command_name=["cmd1", "cmd2"],
67
+ output_logs=["log1", "log2"],
68
+ )
69
+
70
+ assert result is None
71
+ mock_commands.assert_command_run("foo bar", times=2)
72
+ mock_commands.assert_command_run("cmd1")
73
+ mock_commands.assert_command_run("cmd2")
74
+
75
+
57
76
  def test_set_result_for_command(mock_commands: CommandsTester):
58
77
  mock_commands.set_result_for_command((17, "Error!"), "cmd", "--err")
59
78
 
@@ -84,6 +84,12 @@ def add_volumes(job_manifest):
84
84
  volumes.append(
85
85
  {'name': 'gib', 'hostPath': {'path': '/home/kubernetes/bin/gib'}}
86
86
  )
87
+ volumes.append({
88
+ 'name': 'dshm',
89
+ 'emptyDir': {
90
+ 'medium': 'Memory',
91
+ },
92
+ })
87
93
 
88
94
 
89
95
  def add_tolerations(job_manifest):
@@ -111,3 +117,6 @@ def update_gpu_containers(job_manifest):
111
117
  container['volumeMounts'].append(
112
118
  {'name': 'gib', 'mountPath': '/usr/local/gib'}
113
119
  )
120
+ container['volumeMounts'].append(
121
+ {'name': 'dshm', 'mountPath': '/dev/shm'}
122
+ )
xpk/parser/cluster.py CHANGED
@@ -338,7 +338,10 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
338
338
  add_resource_limits(cluster_create_resource_limits)
339
339
 
340
340
  cluster_create_ray_parser.set_defaults(
341
- func=cluster_create_ray_cluster, sub_slicing=False, super_slicing=False
341
+ func=cluster_create_ray_cluster,
342
+ sub_slicing=False,
343
+ super_slicing=False,
344
+ num_cubes=None,
342
345
  )
343
346
 
344
347
 
@@ -503,6 +506,13 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
503
506
  )
504
507
  add_driver_arguments(cluster_adapt_optional_arguments)
505
508
  add_shared_arguments(cluster_adapt_optional_arguments)
509
+ add_resource_limits(cluster_adapt_optional_arguments)
510
+
511
+ if FeatureFlags.SUB_SLICING_ENABLED:
512
+ add_cluster_create_sub_slicing_arguments(cluster_adapt_optional_arguments)
513
+
514
+ if FeatureFlags.SUPER_SLICING_ENABLED:
515
+ add_cluster_create_super_slicing_arguments(cluster_adapt_optional_arguments)
506
516
 
507
517
  cluster_adapt_capacity_arguments = cluster_adapt_parser.add_argument_group(
508
518
  'Capacity Arguments', 'Arguments related to capacity for cluster create.'
@@ -15,8 +15,8 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  import argparse
18
- from xpk.parser.cluster import set_cluster_create_parser, set_cluster_create_pathways_parser, set_cluster_create_ray_parser
19
18
  import pytest
19
+ from xpk.parser.cluster import set_cluster_create_parser, set_cluster_create_pathways_parser, set_cluster_create_ray_parser, set_cluster_adapt_parser
20
20
  from ..utils.feature_flags import FeatureFlags
21
21
 
22
22
 
@@ -261,3 +261,61 @@ def test_cluster_create_num_slices_has_no_default_if_superslicing_feature():
261
261
  )
262
262
 
263
263
  assert args.num_slices is None
264
+
265
+
266
+ def test_cluster_adapt_sub_slicing_is_hidden_with_flag_off():
267
+ FeatureFlags.SUB_SLICING_ENABLED = False
268
+ parser = argparse.ArgumentParser()
269
+
270
+ set_cluster_adapt_parser(parser)
271
+ help_str = parser.format_help()
272
+
273
+ assert "--sub-slicing" not in help_str
274
+
275
+
276
+ def test_cluster_adapt_sub_slicing_is_shown_with_flag_on():
277
+ FeatureFlags.SUB_SLICING_ENABLED = True
278
+ parser = argparse.ArgumentParser()
279
+
280
+ set_cluster_adapt_parser(parser)
281
+ help_str = parser.format_help()
282
+
283
+ assert "--sub-slicing" in help_str
284
+
285
+
286
+ def test_cluster_adapt_super_slicing_is_hidden_with_flag_off():
287
+ FeatureFlags.SUPER_SLICING_ENABLED = False
288
+ parser = argparse.ArgumentParser()
289
+
290
+ set_cluster_adapt_parser(parser)
291
+ help_str = parser.format_help()
292
+
293
+ assert "--super-slicing" not in help_str
294
+
295
+
296
+ def test_cluster_adapt_super_slicing_is_shown_with_flag_on():
297
+ FeatureFlags.SUPER_SLICING_ENABLED = True
298
+ parser = argparse.ArgumentParser()
299
+
300
+ set_cluster_adapt_parser(parser)
301
+ help_str = parser.format_help()
302
+
303
+ assert "--super-slicing" in help_str
304
+
305
+
306
+ def test_cluster_adapt_memory_limit_is_shown():
307
+ parser = argparse.ArgumentParser()
308
+
309
+ set_cluster_adapt_parser(parser)
310
+ help_str = parser.format_help()
311
+
312
+ assert "--memory-limit" in help_str
313
+
314
+
315
+ def test_cluster_adapt_cpu_limit_is_shown():
316
+ parser = argparse.ArgumentParser()
317
+
318
+ set_cluster_adapt_parser(parser)
319
+ help_str = parser.format_help()
320
+
321
+ assert "--cpu-limit" in help_str
xpk/parser/common.py CHANGED
@@ -83,6 +83,17 @@ def add_shared_arguments(
83
83
  help='GCE project name, defaults to "gcloud config project."',
84
84
  required=required,
85
85
  )
86
+ custom_parser_or_group.add_argument(
87
+ '--project-number',
88
+ type=str,
89
+ default=None,
90
+ help=(
91
+ 'GCE project number. If provided, skips the Cloud Resource Manager'
92
+ ' API call to translate project ID to project number. Useful when'
93
+ ' the API is not enabled or you lack permissions.'
94
+ ),
95
+ required=False,
96
+ )
86
97
  custom_parser_or_group.add_argument(
87
98
  '--zone',
88
99
  type=str,
xpk/parser/storage.py CHANGED
@@ -127,7 +127,7 @@ def add_storage_attach_parser(
127
127
  type=str,
128
128
  help=(
129
129
  '(optional) Name of the bucket. If not set, then the "name" parameter'
130
- ' is infered as a bucket name.'
130
+ ' is inferred as a bucket name.'
131
131
  ),
132
132
  )
133
133
  gcsfuse_args.add_argument(
@@ -165,7 +165,7 @@ def add_storage_attach_parser(
165
165
  type=str,
166
166
  help=(
167
167
  '(optional) Name of the filestore instance. If not set, then the'
168
- ' "name" parameter is infered as an instance name.'
168
+ ' "name" parameter is inferred as an instance name.'
169
169
  ),
170
170
  )
171
171
 
@@ -238,7 +238,7 @@ def add_storage_create_parser(storage_subcommands_parser: Subcommands) -> None:
238
238
  type=str,
239
239
  help=(
240
240
  '(optional) Name of the filestore instance. If not set, then the'
241
- ' "name" parameter is infered as an instance name.'
241
+ ' "name" parameter is inferred as an instance name.'
242
242
  ),
243
243
  )
244
244
  opt_args.add_argument(
xpk/utils/console.py CHANGED
@@ -51,7 +51,7 @@ def ask_for_user_consent(
51
51
  question: str, default_option: Literal["Y", "N"] = "N"
52
52
  ) -> bool:
53
53
  """Prompts user with the given question, asking for a yes/no answer and returns a relevant boolean.
54
- Important: immediatelly returns `True` in quiet mode!
54
+ Important: immediately returns `True` in quiet mode!
55
55
 
56
56
  Example prompt for `question='Continue?'`: `[XPK] Continue? (y/N): `.
57
57
 
@@ -17,20 +17,24 @@ limitations under the License.
17
17
  import os
18
18
 
19
19
 
20
+ def is_tester() -> bool:
21
+ """Returns true if user is a tester."""
22
+ return os.getenv("XPK_TESTER", "").lower() == "true"
23
+
24
+
20
25
  def _get_boolean_flag(flag: str, default: bool) -> bool:
21
26
  experiment_value = os.getenv(flag, "").lower()
22
27
  if experiment_value in ["true", "false"]:
23
28
  return experiment_value == "true"
24
29
 
25
- xpk_tester = os.getenv("XPK_TESTER", "").lower() == "true"
26
- return xpk_tester or default
30
+ return is_tester() or default
27
31
 
28
32
 
29
33
  class _FeatureFlags:
30
34
  SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
31
35
  TELEMETRY_ENABLED = _get_boolean_flag("TELEMETRY_ENABLED", default=True)
32
36
  SUPER_SLICING_ENABLED = _get_boolean_flag(
33
- "SUPER_SLICING_ENABLED", default=False
37
+ "SUPER_SLICING_ENABLED", default=True
34
38
  )
35
39
 
36
40
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xpk
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: xpk helps Cloud developers to orchestrate training jobs on accelerators on GKE.
5
5
  Author-email: XPK team <xpk-code-reviewers@google.com>
6
6
  License: Apache-2.0
@@ -93,28 +93,41 @@ XPK supports a variety of hardware accelerators.
93
93
 
94
94
  XPK also supports the following [Google Cloud Storage solutions](./docs/usage/storage.md):
95
95
 
96
- | Storage Type | Documentation |
97
- |--------------------------------------------|------------------------------------------------------------------------------------------|
98
- | Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
99
- | Filestore | [docs](./docs/usage/storage.md#filestore) |
100
- | Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
101
- | Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
96
+ | Storage Type | Documentation |
97
+ | ------------------------------------------ | ----------------------------------------------------------------------- |
98
+ | Cloud Storage FUSE | [docs](./docs/usage/storage.md#fuse) |
99
+ | Filestore | [docs](./docs/usage/storage.md#filestore) |
100
+ | Parallelstore | [docs](./docs/usage/storage.md#parallelstore) |
101
+ | Block storage (Persistent Disk, Hyperdisk) | [docs](./docs/usage/storage.md#block-storage-persistent-disk-hyperdisk) |
102
102
 
103
103
  # Documentation
104
104
 
105
- * [Permissions](./docs/permissions.md)
106
- * [Installation](./docs/installation.md)
107
- * Usage:
108
- * [Clusters](./docs/usage/clusters.md)
109
- * [GPU](./docs/usage/gpu.md)
110
- * [CPU](./docs/usage/cpu.md)
111
- * [Autoprovisioning](./docs/usage/autoprovisioning.md)
112
- * [Workloads](./docs/usage/workloads.md)
113
- * [Docker](./docs/usage/docker.md)
114
- * [Storage](./docs/usage/storage.md)
115
- * [Advanced](./docs/usage/advanced.md)
116
- * [Inspector](./docs/usage/inspector.md)
117
- * [Troubleshooting](./docs/troubleshooting.md)
105
+ - [Permissions](./docs/permissions.md)
106
+ - [Installation](./docs/installation.md)
107
+ - Usage:
108
+ - [Clusters](./docs/usage/clusters.md)
109
+ - [GPU](./docs/usage/gpu.md)
110
+ - [CPU](./docs/usage/cpu.md)
111
+ - [Autoprovisioning](./docs/usage/autoprovisioning.md)
112
+ - [Workloads](./docs/usage/workloads.md)
113
+ - [Docker](./docs/usage/docker.md)
114
+ - [Storage](./docs/usage/storage.md)
115
+ - [Advanced](./docs/usage/advanced.md)
116
+ - [Inspector](./docs/usage/inspector.md)
117
+ - [Troubleshooting](./docs/troubleshooting.md)
118
+
119
+ # Dependencies
120
+
121
+ | Dependency | When used |
122
+ | ------------------------------------------------------------------------------------------------------------ | --------------------------- |
123
+ | [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/install) | _always_ |
124
+ | [kubectl](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_kubectl) | _always_ |
125
+ | [ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit) | Provisioning GPU clusters |
126
+ | [Kueue](https://github.com/kubernetes-sigs/kueue) | Scheduling workloads |
127
+ | [JobSet](https://github.com/kubernetes-sigs/jobset) | Workload creation |
128
+ | [Docker](https://docs.docker.com/engine/install/) | Building workload container |
129
+ | [CoreDNS](https://github.com/coredns/deployment/tree/master/kubernetes) | Cluster set up |
130
+ | [PathwaysJob](https://github.com/google/pathways-job) | Running Pathways workloads |
118
131
 
119
132
  # Privacy notice
120
133
 
@@ -129,11 +142,14 @@ XPK telemetry overall is handled in accordance with the [Google Privacy Policy](
129
142
  you use XPK to interact with or utilize GCP Services, your information is handled in accordance with the
130
143
  [Google Cloud Privacy Notice](https://cloud.google.com/terms/cloud-privacy-notice).
131
144
 
132
-
133
145
  # Contributing
134
146
 
135
147
  Please read [`contributing.md`](./docs/contributing.md) for details on our code of conduct, and the process for submitting pull requests to us.
136
148
 
149
+ # Get involved
150
+
151
+ We'd love to hear from you! If you have questions or want to discuss ideas, join us on [GitHub Discussions](https://github.com/AI-Hypercomputer/xpk/discussions). Found a bug or have a feature request? Please let us know on [GitHub Issues](https://github.com/AI-Hypercomputer/xpk/issues).
152
+
137
153
  # License
138
154
 
139
155
  This project is licensed under the Apache License 2.0 - see the [`LICENSE`](./LICENSE) file for details