konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,581 @@
1
+ """This module contains schemas used to validate objects.
2
+
3
+ Schemas conform to the JSON Schema specification as defined at
4
+ https://json-schema.org/
5
+ """
6
+
7
+ import enum
8
+ from typing import Any, Dict, List, Tuple
9
+
10
# Key paths into the config schema that a task is allowed to override.
# _experimental_task_schema() passes this list to _filter_schema() to build
# the schema for `experimental.config_overrides`.
OVERRIDEABLE_CONFIG_KEYS: List[Tuple[str, ...]] = [
    ('kubernetes', leaf) for leaf in ('pod_config', 'provision_timeout')
]
14
+
15
+
16
+ def _check_not_both_fields_present(field1: str, field2: str):
17
+ return {
18
+ 'oneOf': [
19
+ {'required': [field1], 'not': {'required': [field2]}},
20
+ {'required': [field2], 'not': {'required': [field1]}},
21
+ {'not': {'anyOf': [{'required': [field1]}, {'required': [field2]}]}},
22
+ ]
23
+ }
24
+
25
+
26
+ def _get_single_resources_schema():
27
+ """Schema for a single resource in a resources list."""
28
+ # To avoid circular imports, only import when needed.
29
+ # pylint: disable=import-outside-toplevel
30
+ return {
31
+ '$schema': 'https://json-schema.org/draft/2020-12/schema',
32
+ 'type': 'object',
33
+ 'required': [],
34
+ 'additionalProperties': False,
35
+ 'properties': {
36
+ 'cpus': {
37
+ 'anyOf': [
38
+ {
39
+ 'type': 'string',
40
+ },
41
+ {
42
+ 'type': 'number',
43
+ },
44
+ ],
45
+ },
46
+ 'memory': {
47
+ 'anyOf': [
48
+ {
49
+ 'type': 'string',
50
+ },
51
+ {
52
+ 'type': 'number',
53
+ },
54
+ ],
55
+ },
56
+ 'accelerators': {
57
+ 'anyOf': [
58
+ {
59
+ 'type': 'string',
60
+ },
61
+ {
62
+ 'type': 'object',
63
+ 'required': [],
64
+ 'maxProperties': 1,
65
+ 'additionalProperties': {'type': 'number'},
66
+ },
67
+ ]
68
+ },
69
+ 'disk_size': {
70
+ 'type': 'integer',
71
+ },
72
+ 'labels': {'type': 'object', 'additionalProperties': {'type': 'string'}},
73
+ 'image_id': {
74
+ 'anyOf': [
75
+ {
76
+ 'type': 'string',
77
+ },
78
+ {
79
+ 'type': 'object',
80
+ 'required': [],
81
+ },
82
+ {
83
+ 'type': 'null',
84
+ },
85
+ ]
86
+ },
87
+ '_cluster_config_overrides': {
88
+ 'type': 'object',
89
+ },
90
+ },
91
+ }
92
+
93
+
94
def _get_multi_resources_schema():
    """Return the single-resource schema without its '$schema' key.

    The result is used as the ``items`` schema of the 'any_of' and 'ordered'
    lists in get_resources_schema().
    """
    nested_schema = dict(_get_single_resources_schema())
    # Validation may fail if $schema is included.
    nested_schema.pop('$schema', None)
    return nested_schema
102
+
103
+
104
def get_resources_schema():
    """Resource schema in task config.

    Starts from the single-resource properties, replaces 'accelerators' with
    a more permissive definition, and adds the 'any_of' / 'ordered' lists
    whose items follow the single-resource schema.

    Returns:
        A new JSON-schema dict (declared as draft-07).
    """
    # Every single-resource property except 'accelerators', which is
    # redefined below.
    base_properties = _get_single_resources_schema()['properties']
    del base_properties['accelerators']

    candidate_schema = _get_multi_resources_schema()

    # We redefine the 'accelerators' field to allow a one-line list or a set
    # of accelerators. {'V100:1', 'A100:1'} will be read as a string and
    # converted to dict.
    accelerators = {
        'anyOf': [
            {'type': 'string'},
            {
                'type': 'object',
                'required': [],
                'additionalProperties': {
                    'anyOf': [{'type': 'null'}, {'type': 'number'}]
                },
            },
            {'type': 'array', 'items': {'type': 'string'}},
        ]
    }

    properties = dict(base_properties)
    properties['accelerators'] = accelerators
    # Both lists intentionally reference the same items schema object.
    properties['any_of'] = {'type': 'array', 'items': candidate_schema}
    properties['ordered'] = {'type': 'array', 'items': candidate_schema}

    schema = {
        '$schema': 'http://json-schema.org/draft-07/schema#',
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': properties,
    }
    # Avoid job_recovery and spot_recovery being present at the same time.
    # NOTE(review): neither key is declared in 'properties' and
    # additionalProperties is False, so this constraint looks like a leftover
    # -- confirm whether it is still needed.
    schema.update(_check_not_both_fields_present('job_recovery', 'spot_recovery'))
    return schema
159
+
160
+
161
+ def _filter_schema(schema: dict, keys_to_keep: List[Tuple[str, ...]]) -> dict:
162
+ """Recursively filter a schema to include only certain keys.
163
+
164
+ Args:
165
+ schema: The original schema dictionary.
166
+ keys_to_keep: List of tuples with the path of keys to retain.
167
+
168
+ Returns:
169
+ The filtered schema.
170
+ """
171
+ # Convert list of tuples to a dictionary for easier access
172
+ paths_dict: Dict[str, Any] = {}
173
+ for path in keys_to_keep:
174
+ current = paths_dict
175
+ for step in path:
176
+ if step not in current:
177
+ current[step] = {}
178
+ current = current[step]
179
+
180
+ def keep_keys(
181
+ current_schema: dict, current_path_dict: dict, new_schema: dict
182
+ ) -> dict:
183
+ # Base case: if we reach a leaf in the path_dict, we stop.
184
+ if (
185
+ not current_path_dict
186
+ or not isinstance(current_schema, dict)
187
+ or not current_schema.get('properties')
188
+ ):
189
+ return current_schema
190
+
191
+ if 'properties' not in new_schema:
192
+ new_schema = {
193
+ key: current_schema[key]
194
+ for key in current_schema
195
+ # We do not support the handling of `oneOf`, `anyOf`, `allOf`,
196
+ # `required` for now.
197
+ if key not in {'properties', 'oneOf', 'anyOf', 'allOf', 'required'}
198
+ }
199
+ new_schema['properties'] = {}
200
+ for key, sub_schema in current_schema['properties'].items():
201
+ if key in current_path_dict:
202
+ # Recursively keep keys if further path dict exists
203
+ new_schema['properties'][key] = {}
204
+ current_path_value = current_path_dict.pop(key)
205
+ new_schema['properties'][key] = keep_keys(
206
+ sub_schema, current_path_value, new_schema['properties'][key]
207
+ )
208
+
209
+ return new_schema
210
+
211
+ # Start the recursive filtering
212
+ new_schema = keep_keys(schema, paths_dict, {})
213
+ assert not paths_dict, f'Unprocessed keys: {paths_dict}'
214
+ return new_schema
215
+
216
+
217
def _experimental_task_schema() -> dict:
    """Build the 'experimental' property of the task schema.

    The only allowed sub-key is 'config_overrides', whose schema is the
    config schema filtered down to OVERRIDEABLE_CONFIG_KEYS.

    Returns:
        A one-entry dict mapping 'experimental' to its object schema.
    """
    overrides_schema = _filter_schema(get_config_schema(), OVERRIDEABLE_CONFIG_KEYS)
    experimental = {
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': {'config_overrides': overrides_schema},
    }
    return {'experimental': experimental}
231
+
232
+
233
def get_task_schema():
    """Return the JSON schema (draft 2020-12) for a task definition.

    Unknown top-level keys are rejected. 'resources', 'file_mounts' and
    'service' are only checked to be objects here.
    """

    def single_numeric_entry() -> dict:
        # Object with at most one entry whose value is a number; built per
        # call so 'inputs' and 'outputs' do not share one dict.
        return {
            'type': 'object',
            'required': [],
            'maxProperties': 1,
            'additionalProperties': {'type': 'number'},
        }

    envs = {
        'type': 'object',
        'required': [],
        'patternProperties': {
            # Checks env keys are valid env var names.
            '^[a-zA-Z_][a-zA-Z0-9_]*$': {'type': ['string', 'null']}
        },
        'additionalProperties': False,
    }

    properties = {
        'name': {'type': 'string'},
        'workdir': {'type': 'string'},
        'event_callback': {'type': 'string'},
        'num_nodes': {'type': 'integer'},
        # The next three are only checked to be mappings here; the original
        # comments say each is validated separately against its own schema
        # (presumably get_resources_schema / get_storage_schema -- confirm).
        'resources': {'type': 'object'},
        'file_mounts': {'type': 'object'},
        'service': {'type': 'object'},
        'setup': {'type': 'string'},
        'run': {'type': 'string'},
        'envs': envs,
        # inputs and outputs are experimental
        'inputs': single_numeric_entry(),
        'outputs': single_numeric_entry(),
    }
    properties.update(_experimental_task_schema())

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': properties,
    }
295
+
296
+
297
def get_cluster_schema():
    """Return the JSON schema (draft 2020-12) for a cluster description.

    'cluster' (with 'ips' and 'name') and 'auth' (with 'ssh_user' and
    'ssh_private_key') are required; 'python' is an optional string.
    Unknown keys are rejected at every level.
    """
    text = 'string'

    cluster = {
        'type': 'object',
        'required': ['ips', 'name'],
        'additionalProperties': False,
        'properties': {
            'ips': {'type': 'array', 'items': {'type': text}},
            'name': {'type': text},
        },
    }
    auth = {
        'type': 'object',
        'required': ['ssh_user', 'ssh_private_key'],
        'additionalProperties': False,
        'properties': {
            'ssh_user': {'type': text},
            'ssh_private_key': {'type': text},
        },
    }

    schema = {'$schema': 'https://json-schema.org/draft/2020-12/schema'}
    schema['type'] = 'object'
    schema['required'] = ['cluster', 'auth']
    schema['additionalProperties'] = False
    schema['properties'] = {
        'cluster': cluster,
        'auth': auth,
        'python': {'type': text},
    }
    return schema
338
+
339
+
340
+ _NETWORK_CONFIG_SCHEMA = {
341
+ 'vpc_name': {
342
+ 'oneOf': [
343
+ {
344
+ 'type': 'string',
345
+ },
346
+ {
347
+ 'type': 'null',
348
+ },
349
+ ],
350
+ },
351
+ 'use_internal_ips': {
352
+ 'type': 'boolean',
353
+ },
354
+ 'ssh_proxy_command': {
355
+ 'oneOf': [
356
+ {
357
+ 'type': 'string',
358
+ },
359
+ {
360
+ 'type': 'null',
361
+ },
362
+ {
363
+ 'type': 'object',
364
+ 'required': [],
365
+ 'additionalProperties': {
366
+ 'anyOf': [
367
+ {'type': 'string'},
368
+ {'type': 'null'},
369
+ ]
370
+ },
371
+ },
372
+ ]
373
+ },
374
+ }
375
+
376
+ _LABELS_SCHEMA = {
377
+ # Deprecated: 'instance_tags' is replaced by 'labels'. Keeping for backward
378
+ # compatibility. Will be removed after 0.8.0.
379
+ 'instance_tags': {
380
+ 'type': 'object',
381
+ 'required': [],
382
+ 'additionalProperties': {
383
+ 'type': 'string',
384
+ },
385
+ },
386
+ 'labels': {
387
+ 'type': 'object',
388
+ 'required': [],
389
+ 'additionalProperties': {
390
+ 'type': 'string',
391
+ },
392
+ },
393
+ }
394
+
395
+ _PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY = {
396
+ 'oneOf': [
397
+ {'type': 'string'},
398
+ {
399
+ # A list of single-element dict to pretain the
400
+ # order.
401
+ # Example:
402
+ # property_name:
403
+ # - my-cluster1-*: my-property-1
404
+ # - my-cluster2-*: my-property-2
405
+ # - "*"": my-property-3
406
+ 'type': 'array',
407
+ 'items': {
408
+ 'type': 'object',
409
+ 'additionalProperties': {'type': 'string'},
410
+ 'maxProperties': 1,
411
+ 'minProperties': 1,
412
+ },
413
+ },
414
+ ]
415
+ }
416
+
417
+
418
class RemoteIdentityOptions(enum.Enum):
    """Named remote identity types.

    Each member's value equals its name. According to the original note,
    some clouds (e.g., AWS, Kubernetes) additionally accept arbitrary string
    values naming the service account/role to use; those free-form values
    are not part of this enum.
    """

    LOCAL_CREDENTIALS = 'LOCAL_CREDENTIALS'
    SERVICE_ACCOUNT = 'SERVICE_ACCOUNT'
    NO_UPLOAD = 'NO_UPLOAD'
429
+
430
+
431
def get_default_remote_identity(cloud: str) -> str:
    """Get the default remote identity for the specified cloud.

    Args:
        cloud: Cloud name; compared exactly against 'kubernetes'.

    Returns:
        'SERVICE_ACCOUNT' for 'kubernetes', otherwise 'LOCAL_CREDENTIALS'.
    """
    default_option = (
        RemoteIdentityOptions.SERVICE_ACCOUNT
        if cloud == 'kubernetes'
        else RemoteIdentityOptions.LOCAL_CREDENTIALS
    )
    return default_option.value
436
+
437
+
438
# 'remote_identity' restricted to the RemoteIdentityOptions values.
# 'case_insensitive_enum' is not a standard JSON-schema keyword; presumably a
# custom validator elsewhere in the project handles it -- confirm.
_REMOTE_IDENTITY_SCHEMA = {
    'remote_identity': {
        'type': 'string',
        'case_insensitive_enum': [member.value for member in RemoteIdentityOptions],
    }
}
444
+
445
+ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
446
+ 'remote_identity': {
447
+ 'anyOf': [
448
+ {'type': 'string'},
449
+ {'type': 'object', 'additionalProperties': {'type': 'string'}},
450
+ ]
451
+ },
452
+ }
453
+
454
+
455
def get_storage_schema():
    """Return the JSON schema (draft 2020-12) for a storage definition.

    'store' is limited to the entries of ``cloud_stores._REGISTRY`` and
    'mode' to the values of ``storage.StorageMode``. Both use the
    'case_insensitive_enum' keyword. Unknown keys are rejected.
    """
    # Imported inside the function, as in the original.
    # pylint: disable=import-outside-toplevel
    from konduktor import cloud_stores
    from konduktor.data import storage

    store_names = list(cloud_stores._REGISTRY)
    mode_values = [storage_mode.value for storage_mode in storage.StorageMode]

    # A single source string, or a non-empty list of source strings.
    source = {
        'anyOf': [
            {'type': 'string'},
            {'type': 'array', 'minItems': 1, 'items': {'type': 'string'}},
        ]
    }

    properties = {
        'name': {'type': 'string'},
        'source': source,
        'store': {'type': 'string', 'case_insensitive_enum': store_names},
        'persistent': {'type': 'boolean'},
        'mode': {'type': 'string', 'case_insensitive_enum': mode_values},
        '_bucket_sub_path': {'type': 'string'},
        '_force_delete': {'type': 'boolean'},
    }

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': properties,
    }
496
+
497
+
498
def get_config_schema():
    """Return the JSON schema (draft 2020-12) for the configuration.

    Top-level keys are 'admin_policy', 'nvidia_gpus', 'allowed_clouds' and
    one entry per cloud in ``cloud_configs`` (currently only 'kubernetes').
    Unknown keys are rejected.
    """
    # Imported inside the function, as in the original.
    # pylint: disable=import-outside-toplevel
    from konduktor import cloud_stores
    from konduktor.utils import kubernetes_enums

    autoscaler_values = [
        autoscaler.value for autoscaler in kubernetes_enums.KubernetesAutoscalerType
    ]

    kubernetes_properties = {
        'pod_config': {
            'type': 'object',
            'required': [],
            # Allow arbitrary keys since validating pod spec is hard
            'additionalProperties': True,
        },
        'custom_metadata': {
            'type': 'object',
            'required': [],
            # Allow arbitrary keys since validating metadata is hard
            'additionalProperties': True,
            # Disallow 'name' and 'namespace' keys in this dict
            'not': {'anyOf': [{'required': ['name']}, {'required': ['namespace']}]},
        },
        'provision_timeout': {'type': 'integer'},
        'autoscaler': {
            'type': 'string',
            'case_insensitive_enum': autoscaler_values,
        },
    }
    cloud_configs = {
        'kubernetes': {
            'type': 'object',
            'required': [],
            'additionalProperties': False,
            'properties': kubernetes_properties,
        },
    }

    admin_policy_schema = {
        'type': 'string',
        # Check regex to be a valid python module path
        'pattern': r'^[a-zA-Z_][a-zA-Z0-9_]*(\.[a-zA-Z_][a-zA-Z0-9_]*)+$',
    }

    allowed_clouds = {
        # A list of cloud names that are allowed to be used
        'type': 'array',
        # NOTE(review): 'required' only constrains objects in JSON Schema, so
        # on an array schema it has no effect -- confirm whether 'minItems'
        # was intended.
        'required': ['items'],
        'items': {
            'type': 'string',
            'case_insensitive_enum': list(cloud_stores._REGISTRY.keys()),
        },
    }

    gpu_configs = {
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': {'disable_ecc': {'type': 'boolean'}},
    }

    # Attach the matching 'remote_identity' schema to every cloud entry.
    # Only 'kubernetes' exists today, so the generic schema is never chosen.
    for cloud_name, cloud_schema in cloud_configs.items():
        identity_schema = (
            _REMOTE_IDENTITY_SCHEMA_KUBERNETES
            if cloud_name == 'kubernetes'
            else _REMOTE_IDENTITY_SCHEMA
        )
        cloud_schema['properties'].update(identity_schema)

    properties = {
        'admin_policy': admin_policy_schema,
        'nvidia_gpus': gpu_configs,
        'allowed_clouds': allowed_clouds,
    }
    properties.update(cloud_configs)

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': properties,
    }