xpk 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/core/config.py CHANGED
@@ -15,16 +15,14 @@ limitations under the License.
15
15
  """
16
16
 
17
17
  import os
18
- import re
19
18
 
20
19
  import ruamel.yaml
21
20
 
22
21
  from ..utils import file
23
22
  from ..utils.console import xpk_print
24
- from .system_characteristics import AcceleratorType, SystemCharacteristics
25
23
 
26
24
  # This is the version for XPK PyPI package
27
- __version__ = 'v0.9.0'
25
+ __version__ = 'v0.10.1'
28
26
  XPK_CURRENT_VERSION = __version__
29
27
  XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
30
28
 
@@ -117,65 +115,3 @@ class XpkConfig:
117
115
  return None
118
116
  val: dict[str, str] = config_yaml[CONFIGS_KEY]
119
117
  return val
120
-
121
-
122
- def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
123
- """Parses the environment configurations to the jobset config.
124
-
125
- Args:
126
- args: user provided arguments for running the command.
127
- tensorboard_config: configuration of Vertex Tensorboard.
128
- system: system characteristics.
129
- """
130
- env = {}
131
-
132
- env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
133
- if args.env_file:
134
- print('Setting container environment from', args.env_file)
135
- with open(file=args.env_file, mode='r', encoding='utf-8') as f:
136
- for match in env_pat.finditer(f.read()):
137
- variable = match.group(1)
138
- if match.group(2) is not None:
139
- env[variable] = match.group(2)
140
- else:
141
- assert variable in os.environ, (
142
- f'Variable {variable} is not set in the current '
143
- 'environment, a value must be specified.'
144
- )
145
- env[variable] = os.environ[variable]
146
- if args.env:
147
- for var in args.env:
148
- match = env_pat.match(var)
149
- assert match and match.group(2) is not None, (
150
- 'Invalid environment variable, format must be '
151
- f'`--env VARIABLE=value`: {var}'
152
- )
153
- variable = match.group(1)
154
- env[variable] = match.group(2)
155
-
156
- if not args.use_pathways:
157
- if args.debug_dump_gcs:
158
- if 'XLA_FLAGS' in env:
159
- raise ValueError(
160
- 'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
161
- 'and environment file. Please choose one way to define '
162
- 'XLA_FLAGS.'
163
- )
164
- env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
165
-
166
- if tensorboard_config:
167
- env['UPLOAD_DATA_TO_TENSORBOARD'] = True
168
- for key, value in tensorboard_config.items():
169
- env[key.upper()] = value
170
-
171
- if system.accelerator_type == AcceleratorType['GPU']:
172
- # For GPUs, it has two more spaces ahead of name and value respectively
173
- env_format = '''
174
- - name: {key}
175
- value: "{value}"'''
176
- else:
177
- env_format = '''
178
- - name: {key}
179
- value: "{value}"'''
180
-
181
- args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
@@ -30,7 +30,7 @@ import time
30
30
  DockerRunCommandExitCode = 135
31
31
  dockerBuildErrorCode = 134
32
32
  ctk_dockerfile_path = "Dockerfile"
33
- ctk_build_ref = "v1.48.0"
33
+ ctk_build_ref = "v1.57.1"
34
34
  ctk_docker_image = "xpk-ctk"
35
35
  ctk_container_name = "xpk-ctk-container"
36
36
  gcloud_cfg_mount_path = "/root/.config/gcloud"
@@ -14,9 +14,11 @@ See the License for the specific language governing permissions and
14
14
  limitations under the License.
15
15
  """
16
16
 
17
- from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
17
+ import os
18
+ import re
19
+ from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
18
20
  from .cluster import setup_k8s_env
19
- from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to_mount
21
+ from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
20
22
  from .system_characteristics import AcceleratorType, SystemCharacteristics
21
23
 
22
24
 
@@ -64,6 +66,25 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
64
66
  str:
65
67
  YAML with the env config for the main container, as a YAML string.
66
68
  """
69
+ if system.accelerator_type == AcceleratorType['GPU']:
70
+ return get_gpu_env(args, system)
71
+
72
+ if system.accelerator_type == AcceleratorType['CPU']:
73
+ return get_cpu_env(args, system)
74
+
75
+ return format_env_dict(args.env, system) # pytype: disable=bad-return-type
76
+
77
+
78
+ def get_gpu_env(args, system) -> str:
79
+ """Generate environment variables for GPU nodepools
80
+ Args:
81
+ args: user provided arguments for running the command.
82
+ system: system characteristics
84
+
85
+ Returns:
86
+ str: yaml containing env variables
87
+ """
67
88
  gpu_env_yaml = """
68
89
  - name: REPLICATED_JOB_NAME
69
90
  valueFrom:
@@ -73,8 +94,6 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
73
94
  valueFrom:
74
95
  fieldRef:
75
96
  fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
76
- - name: JAX_COORDINATOR_ADDRESS
77
- value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
78
97
  - name: NNODES
79
98
  value: "{args.num_nodes}"
80
99
  - name: NODE_RANK
@@ -84,36 +103,37 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
84
103
  - name: USE_GPUDIRECT
85
104
  value: {gpu_direct_name}
86
105
  - name: GPUS_PER_NODE
87
- value: "{system.chips_per_vm}"
88
- - name: JAX_COORDINATOR_PORT
89
- value: "6002"
106
+ value: "{chips_per_vm}"
90
107
  - name: COMMAND
91
108
  value: "{args.command}"
92
- {args.env}"""
93
-
94
- if system.accelerator_type == AcceleratorType['GPU']:
95
- gpu_direct_name = 'fastrak'
96
- if args.device_type == H100_DEVICE_TYPE:
97
- gpu_direct_name = 'tcpx'
98
- gpu_env_yaml += """
99
- - name: LD_LIBRARY_PATH
100
- value: /usr/local/nvidia/lib64
101
- """
102
- elif args.device_type == H100_MEGA_DEVICE_TYPE:
103
- gpu_direct_name = 'tcpxo'
104
- elif args.device_type == H200_DEVICE_TYPE:
105
- gpu_direct_name = 'rdma'
106
- return gpu_env_yaml.format(
107
- args=args, system=system, gpu_direct_name=gpu_direct_name
108
- )
109
-
110
- if system.accelerator_type == AcceleratorType['CPU']:
111
- return get_cpu_env(args.num_slices, args.env, system)
112
-
113
- return args.env # pytype: disable=bad-return-type
109
+ {custom_envs}"""
110
+
111
+ gpu_direct_name = 'fastrak'
112
+ if args.device_type == H100_DEVICE_TYPE:
113
+ gpu_direct_name = 'tcpx'
114
+ elif args.device_type == H100_MEGA_DEVICE_TYPE:
115
+ gpu_direct_name = 'tcpxo'
116
+ elif args.device_type == H200_DEVICE_TYPE:
117
+ gpu_direct_name = 'rdma'
118
+
119
+ gpu_env_dic = {
120
+ 'JAX_COORDINATOR_PORT': '6002',
121
+ 'JAX_COORDINATOR_ADDRESS': (
122
+ '$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
123
+ ),
124
+ }
125
+
126
+ args.env = gpu_env_dic | args.env
127
+
128
+ return gpu_env_yaml.format(
129
+ args=args,
130
+ chips_per_vm=system.chips_per_vm,
131
+ gpu_direct_name=gpu_direct_name,
132
+ custom_envs=format_env_dict(args.env, system),
133
+ )
114
134
 
115
135
 
116
- def get_cpu_env(num_slices, env_vars, system) -> str:
136
+ def get_cpu_env(args, system) -> str:
117
137
  """Generate environment variables for CPU nodepools
118
138
  Args:
119
139
  num_slices: Number of slices to be used in the workload.
@@ -136,19 +156,87 @@ def get_cpu_env(num_slices, env_vars, system) -> str:
136
156
  valueFrom:
137
157
  fieldRef:
138
158
  fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
139
- - name: PROCESSES_IN_JOB
140
- value: "{processes_in_job}"
141
- - name: JAX_PROCESS_COUNT
142
- value: "{process_count}"
143
- {env_vars}
144
- - name: JAX_COORDINATOR_ADDRESS
145
- value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
159
+ {custom_envs}
146
160
  """
147
- return yaml.format(
148
- processes_in_job=system.vms_per_slice,
149
- process_count=calculate_process_count(num_slices, system.vms_per_slice),
150
- env_vars=env_vars,
151
- )
161
+
162
+ cpu_env_dic = {
163
+ 'PROCESSES_IN_JOB': str(system.vms_per_slice),
164
+ 'JAX_PROCESS_COUNT': str(
165
+ calculate_process_count(args.num_slices, system.vms_per_slice)
166
+ ),
167
+ 'JAX_COORDINATOR_ADDRESS': (
168
+ '$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
169
+ ),
170
+ }
171
+
172
+ args.env = cpu_env_dic | args.env
173
+
174
+ return yaml.format(custom_envs=format_env_dict(args.env, system))
175
+
176
+
177
+ def format_env_dict(env, system: SystemCharacteristics) -> str:
178
+ if system.accelerator_type == AcceleratorType['GPU']:
179
+ # For GPUs, it has two more spaces ahead of name and value respectively
180
+ env_format = '''
181
+ - name: {key}
182
+ value: "{value}"'''
183
+ else:
184
+ env_format = '''
185
+ - name: {key}
186
+ value: "{value}"'''
187
+ return ''.join(env_format.format(key=k, value=v) for k, v in env.items())
188
+
189
+
190
+ def parse_env_config(args, tensorboard_config):
191
+ """Parses the environment configurations to a dictionary.
192
+
193
+ Args:
194
+ args: user provided arguments for running the command.
195
+ tensorboard_config: configuration of Vertex Tensorboard.
196
+ """
198
+ env = {}
199
+
200
+ env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
201
+ if args.env_file:
202
+ print('Setting container environment from', args.env_file)
203
+ with open(file=args.env_file, mode='r', encoding='utf-8') as f:
204
+ for match in env_pat.finditer(f.read()):
205
+ variable = match.group(1)
206
+ if match.group(2) is not None:
207
+ env[variable] = match.group(2)
208
+ else:
209
+ assert variable in os.environ, (
210
+ f'Variable {variable} is not set in the current '
211
+ 'environment, a value must be specified.'
212
+ )
213
+ env[variable] = os.environ[variable]
214
+ if args.env:
215
+ for var in args.env:
216
+ match = env_pat.match(var)
217
+ assert match and match.group(2) is not None, (
218
+ 'Invalid environment variable, format must be '
219
+ f'`--env VARIABLE=value`: {var}'
220
+ )
221
+ variable = match.group(1)
222
+ env[variable] = match.group(2)
223
+
224
+ if not args.use_pathways:
225
+ if args.debug_dump_gcs:
226
+ if 'XLA_FLAGS' in env:
227
+ raise ValueError(
228
+ 'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
229
+ 'and environment file. Please choose one way to define '
230
+ 'XLA_FLAGS.'
231
+ )
232
+ env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
233
+
234
+ if tensorboard_config:
235
+ env['UPLOAD_DATA_TO_TENSORBOARD'] = True
236
+ for key, value in tensorboard_config.items():
237
+ env[key.upper()] = value
238
+
239
+ args.env = env
152
240
 
153
241
 
154
242
  def get_volumes(args, system: SystemCharacteristics) -> str:
@@ -188,13 +276,13 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
188
276
  setup_k8s_env(args), args.storage
189
277
  )
190
278
  for storage in storages:
191
- if storage.type == GCS_FUSE_TYPE:
192
- volumes += f"""- name: {storage.pv}
193
- persistentVolumeClaim:
194
- claimName: {storage.pvc}
195
- readOnly: {storage.readonly}
196
- """
197
- if storage.type == GCP_FILESTORE_TYPE:
279
+ if storage.type in {
280
+ GCS_FUSE_TYPE,
281
+ GCP_FILESTORE_TYPE,
282
+ PARALLELSTORE_TYPE,
283
+ GCE_PD_TYPE,
284
+ LUSTRE_TYPE,
285
+ }:
198
286
  volumes += f"""- name: {storage.pv}
199
287
  persistentVolumeClaim:
200
288
  claimName: {storage.pvc}
@@ -235,34 +323,19 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
235
323
  mountPath: /shared-volume
236
324
  """
237
325
  elif system.accelerator_type == AcceleratorType['GPU']:
238
- if system.device_type == H100_DEVICE_TYPE:
239
- volume_mount_yaml = """- name: nvidia-install-dir-host
240
- mountPath: /usr/local/nvidia/lib64
241
- - name: tcpx-nccl-plugin-volume
242
- mountPath: /usr/local/tcpx
243
- - name: tcpd-socket
244
- mountPath: /tmp
245
- - name: shared-memory
246
- mountPath: /dev/shm
247
- - name: workload-terminated-volume
248
- mountPath: /usr/share/workload"""
249
- elif (
250
- system.device_type == H100_MEGA_DEVICE_TYPE
251
- or system.device_type == H200_DEVICE_TYPE
252
- or system.device_type == B200_DEVICE_TYPE
253
- ):
254
- volume_mount_yaml = ''
326
+ volume_mount_yaml = ''
255
327
 
256
328
  storages: list[Storage] = get_storages_to_mount(
257
329
  setup_k8s_env(args), args.storage
258
330
  )
259
331
  for storage in storages:
260
- if storage.type == GCS_FUSE_TYPE:
261
- volume_mount_yaml += f"""- name: {storage.pv}
262
- mountPath: {storage.mount_point}
263
- readOnly: {storage.readonly}
264
- """
265
- if storage.type == GCP_FILESTORE_TYPE:
332
+ if storage.type in {
333
+ GCS_FUSE_TYPE,
334
+ GCP_FILESTORE_TYPE,
335
+ PARALLELSTORE_TYPE,
336
+ GCE_PD_TYPE,
337
+ LUSTRE_TYPE,
338
+ }:
266
339
  volume_mount_yaml += f"""- name: {storage.pv}
267
340
  mountPath: {storage.mount_point}
268
341
  readOnly: {storage.readonly}
xpk/core/jobset.py ADDED
@@ -0,0 +1,143 @@
1
+ """
2
+ Copyright 2024 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+
17
+ import math
18
+
19
+ from ..utils.console import xpk_exit, xpk_print
20
+ from ..utils.file import write_tmp_file
21
+ from ..core.kueue import (
22
+ MEMORY_SIZE_PER_VM,
23
+ MIN_MEMORY_LIMIT_SIZE,
24
+ )
25
+ from .commands import (
26
+ run_command_for_value,
27
+ run_command_with_updates_retry,
28
+ )
29
+
30
+ jobset_controller_manager_yml = """
31
+ apiVersion: apps/v1
32
+ kind: Deployment
33
+ metadata:
34
+ labels:
35
+ app.kubernetes.io/component: manager
36
+ app.kubernetes.io/created-by: jobset
37
+ app.kubernetes.io/instance: controller-manager
38
+ app.kubernetes.io/managed-by: kustomize
39
+ app.kubernetes.io/name: deployment
40
+ app.kubernetes.io/part-of: jobset
41
+ control-plane: controller-manager
42
+ name: jobset-controller-manager
43
+ namespace: jobset-system
44
+ spec:
45
+ replicas: 1
46
+ selector:
47
+ matchLabels:
48
+ control-plane: controller-manager
49
+ template:
50
+ metadata:
51
+ annotations:
52
+ kubectl.kubernetes.io/default-container: manager
53
+ labels:
54
+ control-plane: controller-manager
55
+ spec:
56
+ containers:
57
+ - args:
58
+ - --config=/controller_manager_config.yaml
59
+ - --zap-log-level=2
60
+ command:
61
+ - /manager
62
+ image: registry.k8s.io/jobset/jobset:v0.8.0
63
+ livenessProbe:
64
+ httpGet:
65
+ path: /healthz
66
+ port: 8081
67
+ initialDelaySeconds: 15
68
+ periodSeconds: 20
69
+ name: manager
70
+ ports:
71
+ - containerPort: 9443
72
+ name: webhook-server
73
+ protocol: TCP
74
+ readinessProbe:
75
+ httpGet:
76
+ path: /readyz
77
+ port: 8081
78
+ initialDelaySeconds: 5
79
+ periodSeconds: 10
80
+ resources:
81
+ limits:
82
+ memory: {memory_limit_size}
83
+ requests:
84
+ cpu: 500m
85
+ memory: 128Mi
86
+ securityContext:
87
+ allowPrivilegeEscalation: false
88
+ capabilities:
89
+ drop:
90
+ - ALL
91
+ volumeMounts:
92
+ - mountPath: /controller_manager_config.yaml
93
+ name: manager-config
94
+ subPath: controller_manager_config.yaml
95
+ - mountPath: /tmp/k8s-webhook-server/serving-certs
96
+ name: cert
97
+ readOnly: true
98
+ securityContext:
99
+ runAsNonRoot: true
100
+ serviceAccountName: jobset-controller-manager
101
+ terminationGracePeriodSeconds: 10
102
+ volumes:
103
+ - configMap:
104
+ name: jobset-manager-config
105
+ name: manager-config
106
+ - name: cert
107
+ secret:
108
+ defaultMode: 420
109
+ secretName: jobset-webhook-server-cert
110
+ """
111
+
112
+
113
+ def update_jobset_resources_if_necessary(args):
114
+ """Update the jobset manifest to increase the resources for the jobset controller manager.
115
+
116
+ Args:
117
+ args: user provided arguments for running the command.
118
+
119
+ Returns:
120
+ 0 if successful, otherwise the non-zero return code of the kubectl apply command.
121
+ """
122
+ # Get total number of nodes
123
+ cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
124
+ return_code, out = run_command_for_value(
125
+ cmd_total_node_num, 'Count total nodes', args
126
+ )
127
+ if return_code != 0:
128
+ xpk_exit(1)
129
+ # 1.2MiB per VM or 4GiB (whichever is greater).
130
+ new_memory_limit = (
131
+ f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
132
+ )
133
+ yml_string = jobset_controller_manager_yml.format(
134
+ memory_limit_size=new_memory_limit,
135
+ )
136
+ tmp = write_tmp_file(yml_string)
137
+ command = f'kubectl apply -f {str(tmp.file.name)}'
138
+
139
+ task = 'Updating jobset Controller Manager resources'
140
+ return_code = run_command_with_updates_retry(command, task, args)
141
+ if return_code != 0:
142
+ xpk_print(f'{task} returned ERROR {return_code}')
143
+ return return_code
xpk/core/kjob.py CHANGED
@@ -40,11 +40,8 @@ from .config import (
40
40
  XpkConfig,
41
41
  )
42
42
  from .network import get_cluster_subnetworks
43
- from .resources import (
44
- AcceleratorType,
45
- SystemCharacteristics,
46
- get_cluster_system_characteristics,
47
- )
43
+ from .system_characteristics import AcceleratorType, SystemCharacteristics
44
+ from .resources import get_cluster_system_characteristics
48
45
  from .storage import (
49
46
  GCS_FUSE_ANNOTATIONS,
50
47
  PARALLELSTORE_ANNOTATIONS,
@@ -380,7 +377,6 @@ def prepare_kjob(args: Namespace) -> int:
380
377
  job_err_code = create_job_template_instance(args, system, service_account)
381
378
  if job_err_code > 0:
382
379
  return job_err_code
383
-
384
380
  pod_err_code = create_pod_template_instance(args, service_account)
385
381
  if pod_err_code > 0:
386
382
  return pod_err_code