konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,501 @@
1
+ """Pod utils: handles pod spec creation and manipulation"""
2
+
3
+ import base64
4
+ import json
5
+ import os
6
+ import tempfile
7
+ import typing
8
+ from typing import Any, Dict
9
+ from urllib.parse import urlparse
10
+
11
+ import click
12
+
13
+ import konduktor
14
+ from konduktor import authentication, config, constants, logging
15
+ from konduktor.backends import constants as backend_constants
16
+ from konduktor.data import registry
17
+ from konduktor.utils import (
18
+ common_utils,
19
+ exceptions,
20
+ kubernetes_utils,
21
+ ux_utils,
22
+ validator,
23
+ )
24
+
25
+ if typing.TYPE_CHECKING:
26
+ pass
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+ _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
31
+
32
+
33
def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
    """Merges the task definition with config to create a final pod spec dict.

    This function is shared between JobSets and Deployments.

    Note: ``task.name`` is mutated here — a short run-id suffix is appended so
    repeated launches get unique Kubernetes resource names.

    Args:
        task: The task definition to render into a pod spec.

    Returns:
        Dict[str, Any]: k8s pod spec

    Raises:
        exceptions.MissingSecretError: if a referenced secret does not exist.
        exceptions.CreateSecretError: if the SSH key secret cannot be written.
        ValueError: if a vllm serving run command already contains flags that
            this function injects itself.
        click.UsageError: if the rendered pod spec fails schema validation.
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)

    # fill out the templating variables
    assert task.resources is not None, 'Task resources are required'
    num_gpus = task.resources.get_accelerator_count() or 0
    task.name = f'{task.name}-{common_utils.get_usage_run_id()[:4]}'
    node_hostnames = ','.join(
        [f'{task.name}-workers-0-{idx}.{task.name}' for idx in range(task.num_nodes)]
    )
    master_addr = f'{task.name}-workers-0-0.{task.name}'

    accelerator_type = task.resources.get_accelerator_type()

    assert task.resources.cpus is not None, 'Task resources cpus are required'
    assert task.resources.memory is not None, 'Task resources memory are required'
    assert task.resources.image_id is not None, 'Task resources image_id are required'

    # template the commands to run on the container for syncing files. At this point
    # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
    # first we iterate through storage_mounts and then file_mounts.
    sync_commands = []
    mkdir_commands = []
    storage_secrets: Dict[str, str] = {}
    # first do storage_mount sync
    for dst, store in task.storage_mounts.items():
        # TODO(asaiacai) idk why but theres an extra storage mount for the
        # file mounts. Should be cleaned up eventually in
        # maybe_translate_local_file_mounts_and_sync_up
        assert store.source is not None and isinstance(
            store.source, str
        ), 'Store source is required'
        store_scheme = urlparse(store.source).scheme
        if '/tmp/konduktor-job-filemounts-files' in dst:
            continue
        # should implement a method here instead of raw dog dict access
        cloud_store = registry._REGISTRY[store_scheme]
        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
        exists, _ = kubernetes_utils.check_secret_exists(
            storage_secrets[store_scheme], namespace=namespace, context=context
        )
        assert exists, (
            f"secret {storage_secrets[store_scheme]} doesn't "
            f'exist in namespace {namespace}'
        )
        mkdir_commands.append(
            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
        )
        assert store._bucket_sub_path is not None
        sync_commands.append(
            cloud_store.make_sync_dir_command(
                os.path.join(store.source, store._bucket_sub_path), dst
            )
        )

    # then do file_mount sync.
    assert task.file_mounts is not None
    for dst, src in task.file_mounts.items():
        # NOTE(review): `store` leaks from the storage_mounts loop above; if
        # storage_mounts is empty this raises NameError, and the scheme may not
        # correspond to `src`. Presumably this should parse `src` — TODO confirm
        # before changing, since translated file-mount sources may share the
        # storage mount's scheme.
        store_scheme = str(urlparse(store.source).scheme)
        cloud_store = registry._REGISTRY[store_scheme]
        mkdir_commands.append(
            f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
            f'mkdir -p {os.path.dirname(dst)}'
        )
        storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
        exists, reason = kubernetes_utils.check_secret_exists(
            storage_secrets[store_scheme], namespace=namespace, context=context
        )
        assert exists, (
            f'secret {storage_secrets[store_scheme]} '
            f"doesn't exist in namespace {namespace}"
        )
        sync_commands.append(cloud_store.make_sync_file_command(src, dst))

    # Tailscale auth-key secret is optional; if configured it must exist.
    tailscale_secret = config.get_nested(('tailscale', 'secret_name'), None)
    if tailscale_secret:
        secret_exist, err = kubernetes_utils.check_secret_exists(
            tailscale_secret, namespace, context
        )
        if not secret_exist:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.MissingSecretError(
                    f'No tailscale auth-key secret `{tailscale_secret}` found even '
                    f'though specified by `tailscale.secret_name`: {err}'
                )

    # SSH is implicitly enabled whenever tailscale is configured.
    enable_ssh = config.get_nested(('ssh', 'enable'), False) or tailscale_secret
    secret_name = None
    if enable_ssh:
        # Generate (or reuse) a local keypair and push it as a per-user secret.
        private_key_path, public_key_path = authentication.get_or_generate_keys()
        with (
            open(private_key_path, 'rb') as private_key_file,
            open(public_key_path, 'rb') as public_key_file,
        ):
            private_key, public_key = private_key_file.read(), public_key_file.read()
        user_hash = common_utils.get_user_hash()
        secret_name = f'konduktor-ssh-keys-{user_hash}'
        ok, result = kubernetes_utils.set_secret(
            secret_name=secret_name,
            namespace=namespace,
            context=context,
            data={
                'PUBKEY': base64.b64encode(public_key).decode(),
                'PRIVKEY': base64.b64encode(private_key).decode(),
            },
        )
        if not ok:
            raise exceptions.CreateSecretError(
                f'Failed to set k8s secret {secret_name}: \n{result}'
            )

    # Mount the user's secrets
    git_ssh_secret_name = None
    env_secret_envs = []
    default_secrets = []
    # map k8s secret name -> user-facing basename (k8s names carry suffixes)
    basename_by_k8s: Dict[str, str] = {}

    # only get own secrets
    user_hash = common_utils.get_user_hash()
    label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
    user_secrets = kubernetes_utils.list_secrets(
        namespace, context, label_filter=label_selector
    )

    for secret in user_secrets:
        kind = kubernetes_utils.get_secret_kind(secret)

        # in case the user modified their secret to have no key:value data
        if secret.data is None:
            secret.data = {}

        # fill the map for *all* secrets we see
        k8s_name = secret.metadata.name
        lbls = secret.metadata.labels or {}
        base = lbls.get(
            backend_constants.SECRET_BASENAME_LABEL,
            # fallback: strip trailing "-<something>" once if present
            k8s_name.rsplit('-', 1)[0] if '-' in k8s_name else k8s_name,
        )
        basename_by_k8s[k8s_name] = base

        if kind == 'git-ssh' and git_ssh_secret_name is None:
            git_ssh_secret_name = secret.metadata.name
        elif kind == 'env':
            env_secret_name = secret.metadata.name
            # iterate ALL keys, not just one (ex. if user made a multi-key env secret)
            for key, _ in secret.data.items():
                # wire the env var to read its value from a k8s secret
                env_secret_envs.append(
                    {
                        'name': key,
                        'valueFrom': {
                            'secretKeyRef': {'name': env_secret_name, 'key': key}
                        },
                    }
                )
        elif kind == 'default':
            default_secret_name = secret.metadata.name
            basename = secret.metadata.labels.get(
                backend_constants.SECRET_BASENAME_LABEL, default_secret_name
            )
            default_secrets.append(
                {'k8s_name': default_secret_name, 'mount_name': basename}
            )

    # Check if the task references KONDUKTOR_DEFAULT_SECRETS and that it exists
    uses_default_secret_var = (
        'KONDUKTOR_DEFAULT_SECRETS' in (task.run or '')
        or 'KONDUKTOR_DEFAULT_SECRETS' in (task.setup or '')
        or '/konduktor/default-secrets/' in (task.run or '')
        or '/konduktor/default-secrets/' in (task.setup or '')
    )
    if uses_default_secret_var and not default_secrets:
        raise exceptions.MissingSecretError(
            f'Task references KONDUKTOR_DEFAULT_SECRETS or '
            f'/konduktor/default-secrets but '
            f'user {common_utils.get_cleaned_username()} '
            f'has no default secrets. Paths like '
            f'$KONDUKTOR_DEFAULT_SECRETS/<secret_name>/... will not exist.'
        )

    # Inject --served-model-name, --host, and --port into serving run command
    if task.serving and task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        # BUGFIX: the original condition
        # `'--served-model-name' and '--host' and '--port' not in task.run`
        # only tested `'--port' not in task.run` (the string literals are
        # truthy), so run commands already containing --served-model-name or
        # --host slipped past. Test each flag's absence explicitly.
        if (
            '--served-model-name' not in task.run
            and '--host' not in task.run
            and '--port' not in task.run
        ):
            task.run = task.run.replace(
                '--model',
                (
                    f'--served-model-name {task.name} \\\n'
                    f" --host '0.0.0.0' \\\n"
                    f" --port '{task.serving.ports}' \\\n"
                    f' --model'
                ),
            )
        elif '--served-model-name' in task.run:
            raise ValueError(
                'Error creating vllm deployment: '
                '--served-model-name flag should be excluded from run command'
            )
        elif '--host' in task.run:
            raise ValueError(
                'Error creating vllm deployment: '
                '--host flag should be excluded from run command'
            )
        else:
            raise ValueError(
                'Error creating vllm deployment: '
                '--port flag should be excluded from run command'
            )

    # 'general' distinguishes non-vllm workloads in the pod template.
    general = True
    if task.run and 'vllm.entrypoints.openai.api_server' in task.run:
        general = False

    with tempfile.NamedTemporaryFile() as temp:
        common_utils.fill_template(
            'pod.yaml.j2',
            {
                # TODO(asaiacai) need to parse/round these numbers and sanity check
                'cpu': kubernetes_utils.parse_cpu_or_gpu_resource(
                    str(task.resources.cpus or '')
                ),
                'memory': kubernetes_utils.parse_memory_resource(
                    task.resources.memory or ''
                ),
                'image_id': task.resources.image_id,
                'num_gpus': num_gpus,
                'master_addr': master_addr,
                'num_nodes': task.num_nodes,
                'job_name': task.name,  # append timestamp and user id here?
                'setup_cmd': task.setup or '',
                'run_cmd': task.run or '',
                'node_hostnames': node_hostnames,
                'accelerator_type': accelerator_type,
                'sync_commands': sync_commands,
                'mkdir_commands': mkdir_commands,
                'mount_secrets': storage_secrets,
                'remote_workdir': constants.KONDUKTOR_REMOTE_WORKDIR,
                'user': common_utils.get_cleaned_username(),
                # Tailscale credentials
                'tailscale_secret': tailscale_secret,
                # SSH
                'enable_ssh': enable_ssh,
                'secret_name': secret_name,
                # Serving
                'serving': bool(task.serving),
                'general': general,
                'ports': task.serving.ports if task.serving else None,
                'probe': task.serving.probe if task.serving else None,
                'konduktor_ssh_port': backend_constants.KONDUKTOR_SSH_PORT,
                # Kinds of Secrets
                # --kind git-ssh
                'git_ssh': git_ssh_secret_name,
                # --kind default
                'default_secrets': default_secrets,
                # KONDUKTOR_DEBUG
                'konduktor_debug': os.getenv('KONDUKTOR_DEBUG', 0),
            },
            temp.name,
        )

        # Capture the template env names BEFORE user config is merged
        pod_config_template = common_utils.read_yaml(temp.name)
        tmpl_envs = pod_config_template['kubernetes']['pod_config']['spec'][
            'containers'
        ][0].get('env', [])
        tmpl_env_names = {e['name'] for e in tmpl_envs}

        pod_config = common_utils.read_yaml(temp.name)
        # merge with `~/.konduktor/config.yaml`` (config.yaml overrides template)
        kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
        pod_config = common_utils.read_yaml(temp.name)

    # Find what came from user config (appeared after combine, not in template)
    premerge_envs = pod_config['kubernetes']['pod_config']['spec']['containers'][0].get(
        'env', []
    )
    premerge_names = {e['name'] for e in premerge_envs}
    config_env_names0 = premerge_names - tmpl_env_names

    # Build final env list
    env_map = {env['name']: env for env in premerge_envs}

    # Inject secret envs (env secrets override config.yaml)
    for env in env_secret_envs:
        env_map[env['name']] = env

    # Inject task envs
    # CLI+task.yaml overrides everything else
    # CLI already overrode task.yaml in other code
    for k, v in task.envs.items():
        env_map[k] = {'name': k, 'value': v}

    final_envs_list = list(env_map.values())
    pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = (
        final_envs_list
    )
    container = pod_config['kubernetes']['pod_config']['spec']['containers'][0]
    final_envs = container['env']
    final_names = {e['name'] for e in final_envs}

    logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')

    # 1) Get secret envs actually used in the final env list
    secret_details = sorted(
        (e['name'], e['valueFrom']['secretKeyRef']['name'])
        for e in final_envs
        if isinstance(e, dict)
        and e.get('valueFrom', {})
        and e['valueFrom'].get('secretKeyRef')
    )
    secret_names = [n for n, _ in secret_details]

    # 2) Get task-sourced (CLI+task.yaml) envs actually used in the final env list
    task_all_names = sorted(
        n
        for n in (task.envs or {}).keys()
        if n in final_names and n not in secret_names
    )

    # 3) Get Config.yaml envs actually used in the final env list
    config_names = sorted(
        n
        for n in config_env_names0
        if n in final_names and n not in secret_names and n not in task_all_names
    )

    # 4) Get other envs (template/system) actually used in the final env list
    other_names = sorted(
        final_names - set(secret_names) - set(task_all_names) - set(config_names)
    )

    # Export helper envs for the startup script (names only)
    def _append_helper(name: str, values):
        container['env'].append({'name': name, 'value': ','.join(values)})

    # to show user basenames of k8s secrets instead of actual
    # k8s secret names (which have added suffixes)
    secret_map_pairs = [
        f'{var}={basename_by_k8s.get(secret_k8s, secret_k8s)}'
        for (var, secret_k8s) in secret_details
    ]

    # Priority order: CLI > task.yaml > env secret > config > template/system
    _append_helper(
        'KONDUKTOR_ENV_SECRETS_HOPEFULLY_NO_NAME_COLLISION',
        secret_names,
    )
    _append_helper(
        'KONDUKTOR_ENV_SECRETS_MAP_HOPEFULLY_NO_NAME_COLLISION',
        secret_map_pairs,
    )
    _append_helper(
        'KONDUKTOR_ENV_TASK_ALL_HOPEFULLY_NO_NAME_COLLISION',
        task_all_names,
    )
    _append_helper(
        'KONDUKTOR_ENV_CONFIG_HOPEFULLY_NO_NAME_COLLISION',
        config_names,
    )
    _append_helper(
        'KONDUKTOR_ENV_OTHER_HOPEFULLY_NO_NAME_COLLISION',
        other_names,
    )

    # validate pod spec using json schema
    try:
        validator.validate_pod_spec(pod_config['kubernetes']['pod_config']['spec'])
    except ValueError as e:
        raise click.UsageError(str(e))

    return pod_config
412
+
413
+
414
def inject_deployment_pod_metadata(
    pod_spec: Dict[str, Any], task: 'konduktor.Task'
) -> None:
    """Inject deployment-specific metadata into pod spec.

    Adds the deployment labels/annotations and restart policy that the basic
    pod spec (shared with JobSets) does not carry. Mutates ``pod_spec`` in
    place.

    Args:
        pod_spec: The pod spec dictionary to modify.
        task: The task object containing resource information.
    """
    # Make sure the metadata containers exist before writing into them.
    metadata = pod_spec.setdefault('metadata', {})
    labels = metadata.setdefault('labels', {})
    metadata.setdefault('annotations', {})

    # vllm deployments additionally get the AIBrix model-name label.
    is_vllm = bool(task.run) and 'vllm.entrypoints.openai.api_server' in task.run
    if is_vllm:
        labels[backend_constants.AIBRIX_NAME_LABEL] = task.name

    # Every deployment gets the deployment-name label.
    labels[backend_constants.DEPLOYMENT_NAME_LABEL] = task.name

    # Propagate any user-specified resource labels.
    if task.resources and task.resources.labels:
        labels.update(task.resources.labels)

    # Deployments restart their pods; JobSets do not.
    pod_spec.setdefault('spec', {})['restartPolicy'] = 'Always'
450
+
451
+
452
def merge_pod_into_deployment_template(
    deployment_spec: Dict[str, Any], pod_spec: Dict[str, Any]
) -> None:
    """Attach a pod spec as the deployment's pod template.

    Mutates ``deployment_spec`` in place: the pod spec object (not a copy)
    becomes the value of the deployment's ``template`` key.

    Args:
        deployment_spec: The deployment spec dictionary to modify.
        pod_spec: The pod spec to merge into the deployment template.
    """
    deployment_spec['template'] = pod_spec
462
+
463
+
464
def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task') -> None:
    """Inject JobSet-specific pod metadata.

    Stamps the Kueue max-run-duration annotation and copies resource labels
    onto the JobSet metadata. Mutates ``jobset_spec`` in place.

    Args:
        jobset_spec: The JobSet spec dictionary to modify.
        task: The task object containing resource information.

    Raises:
        ValueError: if ``maxRunDurationSeconds`` is absent from the resource
            labels (it is mandatory for JobSets).
    """
    assert (
        task.resources is not None and task.resources.labels is not None
    ), 'Task resources and task.resources.labels are required'
    resource_labels = task.resources.labels

    # Kueue requires an explicit run-duration cap on every JobSet.
    max_run_duration = resource_labels.get('maxRunDurationSeconds', None)
    if not max_run_duration:
        raise ValueError('maxRunDurationSeconds is required')

    metadata = jobset_spec['jobset']['metadata']
    metadata['annotations'][_RUN_DURATION_ANNOTATION_KEY] = str(max_run_duration)

    # Inject resource labels into JobSet metadata.
    if task.resources and task.resources.labels:
        metadata['labels'].update(resource_labels)
488
+
489
+
490
def merge_pod_into_jobset_template(
    jobset_spec: Dict[str, Any], pod_spec: Dict[str, Any]
) -> None:
    """Attach a pod spec as the JobSet's pod template.

    Mutates ``jobset_spec`` in place: the pod spec object (not a copy) is
    installed as the pod template of the first replicated job.

    Args:
        jobset_spec: The JobSet spec dictionary to modify.
        pod_spec: The pod spec to merge into the JobSet template.
    """
    job_template = jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']
    job_template['spec']['template'] = pod_spec
konduktor/check.py ADDED
@@ -0,0 +1,184 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Credential checks: check cloud credentials and enable clouds.
14
+
15
+ Our architecture is client-server and requires that credentials are stored
16
+ as a secret in the cluster. This makes it so that cluster admins can just
17
+ deploy credentials (s3, gcs, r2) once to the namespace. Users then during job
18
+ use the secret stored for mounting credentials to pods. Users running must also
19
+ have the credentials present on their local machine, otherwise they won't be able to
20
+ upload files to object storage.
21
+
22
+ We have to check that the credentials are valid on the client side.
23
+ If the check fails, then we will attempt to check the credentials present on the client.
24
+ If these credentials are valid, we update the secret on the cluster, and
25
+ run the job as usual.
26
+ If these credentials are not valid, we fail the job and alert the user.
27
+
28
+ """
29
+
30
+ import traceback
31
+ import typing
32
+ from typing import Iterable, List, Optional, Tuple
33
+
34
+ import click
35
+ import colorama
36
+
37
+ from konduktor import config as konduktor_config
38
+ from konduktor import logging
39
+ from konduktor.data import registry
40
+ from konduktor.utils import rich_utils
41
+
42
+ if typing.TYPE_CHECKING:
43
+ from konduktor.data import storage_utils
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+
48
def check(
    quiet: bool = False,
    clouds: Optional[Iterable[str]] = None,
) -> List[str]:
    """Check local credentials for cloud storage providers.

    Probes each registered (and config-allowed) cloud's client credentials,
    printing per-cloud enabled/disabled status plus hints.

    Args:
        quiet: If True, suppress all console output.
        clouds: Optional iterable of cloud names to restrict the check to;
            defaults to every cloud in the storage registry.

    Returns:
        List of cloud names whose credentials validated successfully.

    Raises:
        SystemExit: if no cloud ends up enabled.
    """
    # When quiet, echo becomes a no-op; otherwise a colored click.echo.
    echo = (
        (lambda *_args, **_kwargs: None)
        if quiet
        else lambda *args, **kwargs: click.echo(*args, **kwargs, color=True)
    )
    echo('Checking credentials to enable clouds storage for Konduktor.')
    enabled_clouds = []
    disabled_clouds = []

    def check_one_cloud(
        cloud_tuple: Tuple[str, 'storage_utils.CloudStorage'],
    ) -> None:
        # Probes one cloud and appends its repr to enabled_clouds or
        # disabled_clouds (closures over the lists above).
        cloud_repr, cloud = cloud_tuple
        with rich_utils.safe_status(f'Checking {cloud_repr}...'):
            try:
                logger.info(f'Checking {cloud_repr} local client credentials...')
                ok, reason = cloud.check_credentials()
            except Exception:  # pylint: disable=broad-except
                # Catch all exceptions to prevent a single cloud from blocking
                # the check for other clouds.
                ok, reason = False, traceback.format_exc()
        status_msg = 'enabled' if ok else 'disabled'
        styles = {'fg': 'green', 'bold': False} if ok else {'dim': True}
        echo(' ' + click.style(f'{cloud_repr}: {status_msg}', **styles) + ' ' * 30)
        if ok:
            enabled_clouds.append(cloud_repr)
            if reason is not None:
                # A truthy reason on success is surfaced as a hint, not an error.
                echo(f' Hint: {reason}')
        else:
            disabled_clouds.append(cloud_repr)
            echo(f' Reason: {reason}')

    def get_cloud_tuple(cloud_name: str) -> Tuple[str, 'storage_utils.CloudStorage']:
        # Validates cloud_name and returns a tuple of the cloud's name and
        # the cloud object. Includes special handling for Cloudflare.
        cloud_obj = registry._REGISTRY.get(cloud_name, None)
        assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
        return cloud_name, cloud_obj

    def get_all_clouds():
        # All registered storage scheme names.
        return tuple([c for c in registry._REGISTRY.keys()])

    if clouds is not None:
        cloud_list = clouds
    else:
        cloud_list = get_all_clouds()
    clouds_to_check = [get_cloud_tuple(c) for c in cloud_list]

    # Use allowed_clouds from config if it exists, otherwise check all clouds.
    # Also validate names with get_cloud_tuple.
    config_allowed_cloud_names = [
        c for c in konduktor_config.get_nested(('allowed_clouds',), get_all_clouds())
    ]
    # Use disallowed_cloud_names for logging the clouds that will be disabled
    # because they are not included in allowed_clouds in config.yaml.
    disallowed_cloud_names = [
        c for c in get_all_clouds() if c not in config_allowed_cloud_names
    ]
    # Check only the clouds which are allowed in the config.
    clouds_to_check = [c for c in clouds_to_check if c[0] in config_allowed_cloud_names]

    for cloud_tuple in sorted(clouds_to_check):
        check_one_cloud(cloud_tuple)

    # Cloudflare is not a real cloud in registry.CLOUD_REGISTRY, and should
    # not be inserted into the DB (otherwise `sky launch` and other code would
    # error out when it's trying to look it up in the registry).
    enabled_clouds_set = {
        cloud for cloud in enabled_clouds if not cloud.startswith('Cloudflare')
    }
    disabled_clouds_set = {
        cloud for cloud in disabled_clouds if not cloud.startswith('Cloudflare')
    }

    # Determine the set of enabled clouds: (previously enabled clouds + newly
    # enabled clouds - newly disabled clouds) intersected with
    # config_allowed_clouds, if specified in config.yaml.
    # This means that if a cloud is already enabled and is not included in
    # allowed_clouds in config.yaml, it will be disabled.
    all_enabled_clouds = enabled_clouds_set - disabled_clouds_set

    disallowed_clouds_hint = None
    if disallowed_cloud_names:
        disallowed_clouds_hint = (
            '\nNote: The following clouds were disabled because they were not '
            'included in allowed_clouds in ~/.konduktor/config.yaml: '
            f'{", ".join([c for c in disallowed_cloud_names])}'
        )
    if not all_enabled_clouds:
        # Nothing usable — print guidance and abort the process.
        echo(
            click.style(
                'No cloud is enabled. Konduktor will not be able to run any '
                'task. Run `konduktor check` for more info.',
                fg='red',
                bold=True,
            )
        )
        if disallowed_clouds_hint:
            echo(click.style(disallowed_clouds_hint, dim=True))
        raise SystemExit()
    else:
        # Suggest rerunning the check for the clouds that stayed disabled.
        clouds_arg = ' ' + ' '.join(disabled_clouds) if clouds is not None else ''
        echo(
            click.style(
                '\nTo enable a cloud, follow the hints above and rerun: ', dim=True
            )
            + click.style(f'konduktor check {clouds_arg}', bold=True)
            + '\n'
            + click.style(
                'If any problems remain, refer to detailed docs at: '
                'https://trainy.mintlify.app',  # pylint: disable=line-too-long
                dim=True,
            )
        )

        if disallowed_clouds_hint:
            echo(click.style(disallowed_clouds_hint, dim=True))

    # Pretty print for UX.
    if not quiet:
        enabled_clouds_str = '\n ' + '\n '.join(
            [_format_enabled_storage(cloud) for cloud in sorted(all_enabled_clouds)]
        )
        echo(
            f'\n{colorama.Fore.GREEN}{logging.PARTY_POPPER_EMOJI} '
            f'Enabled clouds {logging.PARTY_POPPER_EMOJI}'
            f'{colorama.Style.RESET_ALL}{enabled_clouds_str}'
        )
    return enabled_clouds
181
+
182
+
183
def _format_enabled_storage(cloud_name: str) -> str:
    """Wrap *cloud_name* in green/reset terminal color codes for display."""
    green = colorama.Fore.GREEN
    reset = colorama.Style.RESET_ALL
    return f'{green}{cloud_name}{reset}'