konduktor-nightly 0.1.0.dev20250325104729__py3-none-any.whl → 0.1.0.dev20250327104656__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = 'eeee2ab274425758fdc33addddadd0d9270b9c59'
17
+ _KONDUKTOR_COMMIT_SHA = 'ac5a9936d3c1b6a76bffd3f660d68ac80634bda5'
18
18
 
19
19
 
20
20
  def _get_git_commit():
@@ -47,5 +47,5 @@ def _get_git_commit():
47
47
 
48
48
 
49
49
  __commit__ = _get_git_commit()
50
- __version__ = '1.0.0.dev0.1.0.dev20250325104729'
50
+ __version__ = '1.0.0.dev0.1.0.dev20250327104656'
51
51
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -30,6 +30,7 @@ JOBSET_USER_LABEL = 'trainy.ai/username'
30
30
  JOBSET_ACCELERATOR_LABEL = 'trainy.ai/accelerator'
31
31
  JOBSET_NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
32
32
 
33
+
33
34
  _JOBSET_METADATA_LABELS = {
34
35
  'jobset_name_label': JOBSET_NAME_LABEL,
35
36
  'jobset_userid_label': JOBSET_USERID_LABEL,
@@ -37,6 +38,8 @@ _JOBSET_METADATA_LABELS = {
37
38
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
38
39
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
39
40
  }
41
+ _RUN_DURATION_ANNOTATION = 'maxRunDurationSeconds'
42
+ _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
40
43
 
41
44
 
42
45
  class JobNotFoundError(Exception):
@@ -196,6 +199,14 @@ def create_jobset(
196
199
  temp.name,
197
200
  )
198
201
  jobset_spec = common_utils.read_yaml(temp.name)
202
+ jobset_spec['jobset']['metadata']['labels'].update(**task.resources.labels)
203
+ assert task.resources.labels is not None
204
+ maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
205
+ if not maxRunDurationSeconds:
206
+ raise ValueError('maxRunDurationSeconds is required')
207
+ jobset_spec['jobset']['metadata']['annotations'][
208
+ _RUN_DURATION_ANNOTATION_KEY
209
+ ] = str(maxRunDurationSeconds)
199
210
  jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']['spec'][
200
211
  'template'
201
212
  ] = pod_spec # noqa: E501
@@ -13,6 +13,7 @@ jobset:
13
13
  {% endif %}
14
14
  trainy.ai/konduktor-managed: "true"
15
15
  parent: "trainy"
16
+ annotations: {}
16
17
  spec:
17
18
  ttlSecondsAfterFinished: 259200 # 3 days
18
19
  replicatedJobs:
@@ -71,10 +71,13 @@ kubernetes:
71
71
  [ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
72
72
 
73
73
 
74
- # Run apt update, install missing packages
75
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
76
- $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
77
- PACKAGES="rsync curl";
74
+ PACKAGES="";
75
+ {% if 'rsync' in run_cmd %}
76
+ PACKAGES="$PACKAGES rsync";
77
+ {% endif %}
78
+ {% if 'curl' in run_cmd %}
79
+ PACKAGES="$PACKAGES curl";
80
+ {% endif %}
78
81
  {% if 'gs' in mount_secrets %}
79
82
  PACKAGES="$PACKAGES unzip wget";
80
83
  {% endif %}
@@ -82,6 +85,13 @@ kubernetes:
82
85
  PACKAGES="$PACKAGES git";
83
86
  {% endif %}
84
87
 
88
+ if [ -z "${PACKAGES}" ]; then
89
+ # Run apt update, install missing packages
90
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
91
+ $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
92
+ fi
93
+
94
+
85
95
  # Separate packages into two groups: packages that are installed first
86
96
  # so that curl and rsync are available sooner to unblock the following
87
97
  # conda installation and rsync.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250325104729
3
+ Version: 0.1.0.dev20250327104656
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,11 +1,11 @@
1
- konduktor/__init__.py,sha256=hGgR0afNh8_cDfglkkxiHKUeT1F2GMUcHrOp1eO49fE,1477
1
+ konduktor/__init__.py,sha256=CWG3yvw6TSNGuQHtxiuStEVcTZ-0X7So3cCPi0XyJks,1477
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/common.py,sha256=mYb_6c3u5MghtiFfiW5OO-EH6t7cIR5npbkgUmz6FYE,3517
4
4
  konduktor/adaptors/gcp.py,sha256=liCm4_D_qSci0DZA2t5bckLIoGDkJ8qx31EO_hSBzo0,3751
5
5
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
6
6
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
7
7
  konduktor/backends/jobset.py,sha256=lh_PihQgM0tmVryCpjSsZjWug8hBnJr7ua9lqk0qEAM,8251
8
- konduktor/backends/jobset_utils.py,sha256=FR_IDoDU8noTE1qSG-L0KAIe52geeGtekzhInnmjgwc,16636
8
+ konduktor/backends/jobset_utils.py,sha256=1LOiP-UVtezbLP0jbtQR7pwBQH3B5b5sqptVVzri-48,17222
9
9
  konduktor/check.py,sha256=hIrxDMKaGX2eZP-Pj9TCymGUHQAp93m48Gj3XMiqadA,7833
10
10
  konduktor/cli.py,sha256=90bnh3nIobfBkzqS_SXgw9Z8Zqh4ouwpLDj0kx_6kL8,23562
11
11
  konduktor/cloud_stores.py,sha256=KX3u5YlXGslMCe_q8zYtFy62_KGCmmLTrYuK7Y9jFIM,6277
@@ -66,8 +66,8 @@ konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-
66
66
  konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
67
67
  konduktor/resource.py,sha256=68z8gC8Ivqktwv0R6ylMn9ZNocgkcRT0yIRGGKOdwcM,18491
68
68
  konduktor/task.py,sha256=edHgMLYECGux6WLCilqsNZNYr3dEcw_miWvu4FYpu5U,34713
69
- konduktor/templates/jobset.yaml.j2,sha256=NevmZYDUBQbzVHiQ6EzlWX8FzdHLcz1bcLxOvD03PKQ,940
70
- konduktor/templates/pod.yaml.j2,sha256=zrYwxTyAFmjh6NtMmiGaOZBFwqCBZW2dRex4RpLh4iE,8142
69
+ konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
70
+ konduktor/templates/pod.yaml.j2,sha256=XLQ2dD7jq9yeF-eKtweXaMrHWj5cFI-DIZwcJ3qkANQ,8433
71
71
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
72
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
73
73
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -87,8 +87,8 @@ konduktor/utils/schemas.py,sha256=4Goihc-NpFQpiJ7RSiKirAIPNWqw_DV_TRqVwejqTDY,17
87
87
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
88
88
  konduktor/utils/ux_utils.py,sha256=NPNu3Igu2Z9Oq77ghJhy_fIxQZTXWr9BtKyxN3Wslzo,7164
89
89
  konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
90
- konduktor_nightly-0.1.0.dev20250325104729.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
91
- konduktor_nightly-0.1.0.dev20250325104729.dist-info/METADATA,sha256=-geNGSpx5h4Z6PlPEaewftsvSLgOQ0lULcWY40MUpzY,4070
92
- konduktor_nightly-0.1.0.dev20250325104729.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
93
- konduktor_nightly-0.1.0.dev20250325104729.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
94
- konduktor_nightly-0.1.0.dev20250325104729.dist-info/RECORD,,
90
+ konduktor_nightly-0.1.0.dev20250327104656.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
91
+ konduktor_nightly-0.1.0.dev20250327104656.dist-info/METADATA,sha256=EX_G_loycMXJbLC5OQc05ikBf0RbbGbA2zMGia3xNyc,4070
92
+ konduktor_nightly-0.1.0.dev20250327104656.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
93
+ konduktor_nightly-0.1.0.dev20250327104656.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
94
+ konduktor_nightly-0.1.0.dev20250327104656.dist-info/RECORD,,