konduktor-nightly 0.1.0.dev20250804105449__py3-none-any.whl → 0.1.0.dev20250806105405__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

@@ -15,7 +15,7 @@ if typing.TYPE_CHECKING:
15
15
  from konduktor.data import storage as storage_lib
16
16
 
17
17
  from konduktor import config, logging
18
- from konduktor.backends import backend, jobset_utils
18
+ from konduktor.backends import backend, jobset_utils, pod_utils
19
19
  from konduktor.utils import kubernetes_utils, log_utils, rich_utils, ux_utils
20
20
 
21
21
  Path = str
@@ -172,7 +172,7 @@ class JobsetBackend(backend.Backend):
172
172
  # the working container starts.
173
173
 
174
174
  # first define the pod spec then create the jobset definition
175
- pod_spec = jobset_utils.create_pod_spec(task)
175
+ pod_spec = pod_utils.create_pod_spec(task)
176
176
  context = kubernetes_utils.get_current_kube_config_context_name()
177
177
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
178
178
  # TODO(asaiacai): need to set env variables in pod
@@ -1,32 +1,25 @@
1
1
  """Jobset utils: wraps CRUD operations for jobsets"""
2
2
 
3
- import base64
4
3
  import enum
5
4
  import json
6
- import os
7
5
  import tempfile
8
6
  import typing
9
7
  from datetime import datetime, timezone
10
8
  from typing import Any, Dict, Optional, Tuple
11
- from urllib.parse import urlparse
12
9
 
13
- import click
14
10
  import colorama
15
11
 
16
12
  if typing.TYPE_CHECKING:
17
13
  from datetime import timedelta
18
14
 
19
15
  import konduktor
20
- from konduktor import authentication, config, constants, kube_client, logging
16
+ from konduktor import kube_client, logging
21
17
  from konduktor.backends import constants as backend_constants
22
- from konduktor.data import registry
18
+ from konduktor.backends import pod_utils
23
19
  from konduktor.utils import (
24
20
  common_utils,
25
- exceptions,
26
21
  kubernetes_utils,
27
22
  log_utils,
28
- ux_utils,
29
- validator,
30
23
  )
31
24
 
32
25
  if typing.TYPE_CHECKING:
@@ -38,13 +31,14 @@ JOBSET_API_GROUP = 'jobset.x-k8s.io'
38
31
  JOBSET_API_VERSION = 'v1alpha2'
39
32
  JOBSET_PLURAL = 'jobsets'
40
33
 
41
- JOBSET_NAME_LABEL = 'trainy.ai/job-name'
42
- JOBSET_USERID_LABEL = 'trainy.ai/user-id'
43
- JOBSET_USER_LABEL = 'trainy.ai/username'
44
- JOBSET_ACCELERATOR_LABEL = 'trainy.ai/accelerator'
45
- JOBSET_NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
34
+ # Use shared constants from konduktor.backends.constants
35
+ JOBSET_NAME_LABEL = backend_constants.JOB_NAME_LABEL
36
+ JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
37
+ JOBSET_USER_LABEL = backend_constants.USER_LABEL
38
+ JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
39
+ JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
46
40
 
47
- SECRET_BASENAME_LABEL = 'konduktor/basename'
41
+ SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
48
42
 
49
43
  _JOBSET_METADATA_LABELS = {
50
44
  'jobset_name_label': JOBSET_NAME_LABEL,
@@ -54,8 +48,6 @@ _JOBSET_METADATA_LABELS = {
54
48
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
55
49
  }
56
50
 
57
- _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
58
-
59
51
 
60
52
  class JobNotFoundError(Exception):
61
53
  pass
@@ -73,236 +65,6 @@ if typing.TYPE_CHECKING:
73
65
  import konduktor
74
66
 
75
67
 
76
- def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
77
- """Merges the task defintion with config
78
- to create a final pod spec dict for the job
79
-
80
- Returns:
81
- Dict[str, Any]: k8s pod spec
82
- """
83
- context = kubernetes_utils.get_current_kube_config_context_name()
84
- namespace = kubernetes_utils.get_kube_config_context_namespace(context)
85
-
86
- # fill out the templating variables
87
- assert task.resources is not None, 'Task resources are required'
88
- if task.resources.accelerators:
89
- num_gpus = list(task.resources.accelerators.values())[0]
90
- else:
91
- num_gpus = 0
92
- task.name = f'{task.name}-{common_utils.get_usage_run_id()[:4]}'
93
- node_hostnames = ','.join(
94
- [f'{task.name}-workers-0-{idx}.{task.name}' for idx in range(task.num_nodes)]
95
- )
96
- master_addr = f'{task.name}-workers-0-0.{task.name}'
97
-
98
- if task.resources.accelerators:
99
- accelerator_type = list(task.resources.accelerators.keys())[0]
100
- else:
101
- accelerator_type = None
102
-
103
- assert task.resources.cpus is not None, 'Task resources cpus are required'
104
- assert task.resources.memory is not None, 'Task resources memory are required'
105
- assert task.resources.image_id is not None, 'Task resources image_id are required'
106
-
107
- # template the commands to run on the container for syncing files. At this point
108
- # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
109
- # first we iterate through storage_mounts and then file_mounts.
110
- sync_commands = []
111
- mkdir_commands = []
112
- storage_secrets = {}
113
- # first do storage_mount sync
114
- for dst, store in task.storage_mounts.items():
115
- # TODO(asaiacai) idk why but theres an extra storage mount for the
116
- # file mounts. Should be cleaned up eventually in
117
- # maybe_translate_local_file_mounts_and_sync_up
118
- assert store.source is not None and isinstance(
119
- store.source, str
120
- ), 'Store source is required'
121
- store_scheme = urlparse(store.source).scheme
122
- if '/tmp/konduktor-job-filemounts-files' in dst:
123
- continue
124
- # should impelement a method here instead of raw dog dict access
125
- cloud_store = registry._REGISTRY[store_scheme]
126
- storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
127
- exists, _ = kubernetes_utils.check_secret_exists(
128
- storage_secrets[store_scheme], namespace=namespace, context=context
129
- )
130
- assert exists, (
131
- f"secret {storage_secrets[store_scheme]} doesn't "
132
- f'exist in namespace {namespace}'
133
- )
134
- mkdir_commands.append(
135
- f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
136
- )
137
- assert store._bucket_sub_path is not None
138
- sync_commands.append(
139
- cloud_store.make_sync_dir_command(
140
- os.path.join(store.source, store._bucket_sub_path), dst
141
- )
142
- )
143
-
144
- # then do file_mount sync.
145
- assert task.file_mounts is not None
146
- for dst, src in task.file_mounts.items():
147
- store_scheme = str(urlparse(store.source).scheme)
148
- cloud_store = registry._REGISTRY[store_scheme]
149
- mkdir_commands.append(
150
- f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
151
- f'mkdir -p {os.path.dirname(dst)}'
152
- )
153
- storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
154
- exists, reason = kubernetes_utils.check_secret_exists(
155
- storage_secrets[store_scheme], namespace=namespace, context=context
156
- )
157
- assert exists, (
158
- f'secret {storage_secrets[store_scheme]} '
159
- f"doesn't exist in namespace {namespace}"
160
- )
161
- sync_commands.append(cloud_store.make_sync_file_command(src, dst))
162
-
163
- tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
164
- if tailscale_secret:
165
- secret_exist, err = kubernetes_utils.check_secret_exists(
166
- tailscale_secret, namespace, context
167
- )
168
- if not secret_exist:
169
- with ux_utils.print_exception_no_traceback():
170
- raise exceptions.MissingSecretError(
171
- f'No tailscale auth-key secret `{tailscale_secret}` found even '
172
- f'though specified by `tailscale.secret_name`: {err}'
173
- )
174
-
175
- enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
176
- secret_name = None
177
- if enable_ssh:
178
- private_key_path, public_key_path = authentication.get_or_generate_keys()
179
- with (
180
- open(private_key_path, 'rb') as private_key_file,
181
- open(public_key_path, 'rb') as public_key_file,
182
- ):
183
- private_key, public_key = private_key_file.read(), public_key_file.read()
184
- user_hash = common_utils.get_user_hash()
185
- secret_name = f'konduktor-ssh-keys-{user_hash}'
186
- ok, result = kubernetes_utils.set_secret(
187
- secret_name=secret_name,
188
- namespace=namespace,
189
- context=context,
190
- data={
191
- 'PUBKEY': base64.b64encode(public_key).decode(),
192
- 'PRIVKEY': base64.b64encode(private_key).decode(),
193
- },
194
- )
195
- if not ok:
196
- raise exceptions.CreateSecretError(
197
- f'Failed to set k8s secret {secret_name}: \n{result}'
198
- )
199
-
200
- # Mount the user's secrets
201
- git_ssh_secret_name = None
202
- env_secret_envs = []
203
- default_secrets = []
204
-
205
- user_hash = common_utils.get_user_hash()
206
- label_selector = f'konduktor/owner={user_hash}'
207
- user_secrets = kubernetes_utils.list_secrets(
208
- namespace, context, label_filter=label_selector
209
- )
210
-
211
- for secret in user_secrets:
212
- kind = kubernetes_utils.get_secret_kind(secret)
213
- if kind == 'git-ssh' and git_ssh_secret_name is None:
214
- git_ssh_secret_name = secret.metadata.name
215
- elif kind == 'env':
216
- env_secret_name = secret.metadata.name
217
- key = next(iter(secret.data))
218
- env_secret_envs.append(
219
- {
220
- 'name': key,
221
- 'valueFrom': {
222
- 'secretKeyRef': {'name': env_secret_name, 'key': key}
223
- },
224
- }
225
- )
226
- elif kind == 'default':
227
- default_secret_name = secret.metadata.name
228
- basename = secret.metadata.labels.get(
229
- SECRET_BASENAME_LABEL, default_secret_name
230
- )
231
- default_secrets.append(
232
- {'k8s_name': default_secret_name, 'mount_name': basename}
233
- )
234
-
235
- with tempfile.NamedTemporaryFile() as temp:
236
- common_utils.fill_template(
237
- 'pod.yaml.j2',
238
- {
239
- # TODO(asaiacai) need to parse/round these numbers and sanity check
240
- 'cpu': kubernetes_utils.parse_cpu_or_gpu_resource(task.resources.cpus),
241
- 'memory': kubernetes_utils.parse_memory_resource(task.resources.memory),
242
- 'image_id': task.resources.image_id,
243
- 'num_gpus': num_gpus,
244
- 'master_addr': master_addr,
245
- 'num_nodes': task.num_nodes,
246
- 'job_name': task.name, # append timestamp and user id here?
247
- 'setup_cmd': task.setup or '',
248
- 'run_cmd': task.run,
249
- 'node_hostnames': node_hostnames,
250
- 'accelerator_type': accelerator_type,
251
- 'sync_commands': sync_commands,
252
- 'mkdir_commands': mkdir_commands,
253
- 'mount_secrets': storage_secrets,
254
- 'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
255
- 'user': common_utils.get_cleaned_username(),
256
- # Tailscale credentials
257
- 'tailscale_secret': tailscale_secret,
258
- # SSH
259
- 'enable_ssh': enable_ssh,
260
- 'secret_name': secret_name,
261
- 'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
262
- # Kinds of Secrets
263
- # --kind git-ssh
264
- 'git_ssh': git_ssh_secret_name,
265
- # --kind default
266
- 'default_secrets': default_secrets,
267
- # KONDUKTOR_DEBUG
268
- 'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
269
- },
270
- temp.name,
271
- )
272
- pod_config = common_utils.read_yaml(temp.name)
273
- # merge with `~/.konduktor/config.yaml``
274
- kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
275
- pod_config = common_utils.read_yaml(temp.name)
276
-
277
- # Priority order: task.envs > secret envs > existing pod_config envs
278
- existing_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
279
- 'env', []
280
- )
281
- env_map = {env['name']: env for env in existing_envs}
282
-
283
- # Inject secret envs
284
- for env in env_secret_envs:
285
- env_map[env['name']] = env
286
-
287
- # Inject task.envs
288
- for k, v in task.envs.items():
289
- env_map[k] = {'name': k, 'value': v}
290
-
291
- # Replace the container's env section with the merged and prioritized map
292
- pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
293
- env_map.values()
294
- )
295
- logger.debug(f'rendered pod spec: \n\t{pod_config}')
296
-
297
- # validate pod spec using json schema
298
- try:
299
- validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
300
- except ValueError as e:
301
- raise click.UsageError(str(e))
302
-
303
- return pod_config
304
-
305
-
306
68
  def create_jobset(
307
69
  namespace: str,
308
70
  task: 'konduktor.Task',
@@ -313,12 +75,8 @@ def create_jobset(
313
75
  and returns the created jobset spec
314
76
  """
315
77
  assert task.resources is not None, 'Task resources are undefined'
316
- if task.resources.accelerators:
317
- accelerator_type = list(task.resources.accelerators.keys())[0]
318
- num_accelerators = list(task.resources.accelerators.values())[0]
319
- else:
320
- accelerator_type = 'None'
321
- num_accelerators = 0
78
+ accelerator_type = task.resources.get_accelerator_type() or 'None'
79
+ num_accelerators = task.resources.get_accelerator_count() or 0
322
80
  with tempfile.NamedTemporaryFile() as temp:
323
81
  common_utils.fill_template(
324
82
  'jobset.yaml.j2',
@@ -336,19 +94,11 @@ def create_jobset(
336
94
  temp.name,
337
95
  )
338
96
  jobset_spec = common_utils.read_yaml(temp.name)
339
- jobset_spec['jobset']['metadata']['labels'].update(
340
- **(task.resources.labels or {})
341
- )
342
- assert task.resources.labels is not None
343
- maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
344
- if not maxRunDurationSeconds:
345
- raise ValueError('maxRunDurationSeconds is required')
346
- jobset_spec['jobset']['metadata']['annotations'][
347
- _RUN_DURATION_ANNOTATION_KEY
348
- ] = str(maxRunDurationSeconds)
349
- jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']['spec'][
350
- 'template'
351
- ] = pod_spec # noqa: E501
97
+ # Inject JobSet metadata (labels and annotations)
98
+ pod_utils.inject_jobset_metadata(jobset_spec, task)
99
+
100
+ # Merge pod spec into JobSet template
101
+ pod_utils.merge_pod_into_jobset_template(jobset_spec, pod_spec)
352
102
  try:
353
103
  context = kubernetes_utils.get_current_kube_config_context_name()
354
104
  jobset = kube_client.crd_api(context=context).create_namespaced_custom_object(