konduktor-nightly 0.1.0.dev20250603105033__py3-none-any.whl → 0.1.0.dev20250605105049__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = 'cf8484c6676903ded1f8aa2758f8cac533329c05'
17
+ _KONDUKTOR_COMMIT_SHA = 'b5ac51935edc296ff721f11d112e64438b366608'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250603105033'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250605105049'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
konduktor/backends/jobset_utils.py CHANGED
@@ -300,6 +300,7 @@ def create_jobset(
300
300
  'accelerator_type': accelerator_type,
301
301
  'num_accelerators': num_accelerators,
302
302
  'completions': task.resources.get_completions(),
303
+ 'max_restarts': task.resources.get_max_restarts(),
303
304
  **_JOBSET_METADATA_LABELS,
304
305
  },
305
306
  temp.name,
konduktor/cli.py CHANGED
@@ -102,6 +102,7 @@ def _make_task_with_overrides(
102
102
  memory: Optional[str] = None,
103
103
  instance_type: Optional[str] = None,
104
104
  num_nodes: Optional[int] = None,
105
+ max_restarts: Optional[int] = None,
105
106
  image_id: Optional[str] = None,
106
107
  disk_size: Optional[int] = None,
107
108
  env: Optional[List[Tuple[str, str]]] = None,
@@ -147,6 +148,10 @@ def _make_task_with_overrides(
147
148
 
148
149
  task.set_resources_override(override_params)
149
150
 
151
+ if max_restarts is not None:
152
+ assert task.resources is not None
153
+ task.resources.job_config['max_restarts'] = max_restarts
154
+
150
155
  if num_nodes is not None:
151
156
  task.num_nodes = num_nodes
152
157
  if name is not None:
@@ -185,6 +190,16 @@ _TASK_OPTIONS = [
185
190
  'supplied.'
186
191
  ),
187
192
  ),
193
+ click.option(
194
+ '--max-restarts',
195
+ required=False,
196
+ type=int,
197
+ help=(
198
+ 'Maximum number of jobset restarts allowed. Overrides YAML.'
199
+ 'Overrides the "max_restarts" config in the YAML if both are '
200
+ 'supplied.'
201
+ ),
202
+ ),
188
203
  click.option(
189
204
  '--cpus',
190
205
  default=None,
@@ -631,6 +646,7 @@ def launch(
631
646
  cpus: Optional[str],
632
647
  memory: Optional[str],
633
648
  num_nodes: Optional[int],
649
+ max_restarts: Optional[int],
634
650
  image_id: Optional[str],
635
651
  env_file: Optional[Dict[str, str]],
636
652
  env: List[Tuple[str, str]],
@@ -654,6 +670,7 @@ def launch(
654
670
  cpus=cpus,
655
671
  memory=memory,
656
672
  num_nodes=num_nodes,
673
+ max_restarts=max_restarts,
657
674
  image_id=image_id,
658
675
  env=env,
659
676
  disk_size=disk_size,
@@ -962,7 +979,7 @@ def create(kind, from_file, from_directory, inline, name):
962
979
  old_name = s.metadata.name
963
980
  click.echo(f'Found existing git-ssh secret: {old_name}, deleting it.')
964
981
  kubernetes_utils.delete_secret(
965
- secret_name=old_name, namespace=namespace, context=context
982
+ name=old_name, namespace=namespace, context=context
966
983
  )
967
984
  break
968
985
 
konduktor/resource.py CHANGED
@@ -124,7 +124,7 @@ class Resources:
124
124
  self._set_cpus(cpus)
125
125
  self._set_memory(memory)
126
126
  self._set_accelerators(accelerators)
127
- self.job_config = job_config
127
+ self.job_config = job_config or {}
128
128
 
129
129
  # TODO: move these out of init to prevent repeated calls.
130
130
  self._try_validate_cpus_mem()
@@ -386,8 +386,17 @@ class Resources:
386
386
  return accel_str
387
387
 
388
388
  def get_completions(self) -> Optional[int]:
389
- if self.job_config and self.job_config['completions']:
390
- return int(self.job_config['completions'])
389
+ value = self.job_config.get('completions')
390
+ return int(value) if value is not None else None
391
+
392
+ def get_max_restarts(self) -> Optional[int]:
393
+ value = self.job_config.get('max_restarts')
394
+ if value is not None:
395
+ value = int(value)
396
+ if value < 0:
397
+ with ux_utils.print_exception_no_traceback():
398
+ raise ValueError('max_restarts must be a non-negative integer')
399
+ return value
391
400
  return None
392
401
 
393
402
  def copy(self, **override) -> 'Resources':
konduktor/templates/jobset.yaml.j2 CHANGED
@@ -16,6 +16,10 @@ jobset:
16
16
  annotations: {}
17
17
  spec:
18
18
  ttlSecondsAfterFinished: 259200 # 3 days
19
+ {% if max_restarts %}
20
+ failurePolicy:
21
+ maxRestarts: {{ max_restarts }}
22
+ {% endif %}
19
23
  replicatedJobs:
20
24
  - name: workers
21
25
  template:
konduktor/templates/pod.yaml.j2 CHANGED
@@ -183,8 +183,8 @@ kubernetes:
183
183
 
184
184
  # Set root password if SSHKEY is provided
185
185
  # Enable root login in SSH configuration
186
- $(prefix_cmd) sed -i '/^#PermitRootLogin/c\PermitRootLogin yes' /etc/ssh/sshd_config
187
- $(prefix_cmd) sed -i '/^PermitRootLogin/c\PermitRootLogin yes' /etc/ssh/sshd_config
186
+ $(prefix_cmd) sed -i '/^#PermitRootLogin/c\PermitRootLogin without-password' /etc/ssh/sshd_config
187
+ $(prefix_cmd) sed -i '/^PermitRootLogin/c\PermitRootLogin without-password' /etc/ssh/sshd_config
188
188
  $(prefix_cmd) echo "Root login is enabled."
189
189
 
190
190
  # Create the .ssh directory and authorized_keys file if they don't exist
konduktor/utils/accelerator_registry.py CHANGED
@@ -1,13 +1,6 @@
1
1
  """Accelerator registry."""
2
2
 
3
- _ACCELERATORS = [
4
- 'A100',
5
- 'A100-80GB',
6
- 'B200',
7
- 'H100',
8
- 'H200',
9
- 'L40S',
10
- ]
3
+ _ACCELERATORS = ['A100', 'A100-80GB', 'B200', 'H100', 'H200', 'L40S', 'T4']
11
4
 
12
5
 
13
6
  def canonicalize_accelerator_name(accelerator: str) -> str:
konduktor/utils/schemas.py CHANGED
@@ -427,6 +427,9 @@ def get_job_schema():
427
427
  'completions': {
428
428
  'type': 'number',
429
429
  },
430
+ 'max_restarts': {
431
+ 'type': 'integer',
432
+ },
430
433
  },
431
434
  }
432
435
 
konduktor_nightly-0.1.0.dev20250605105049.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250603105033
3
+ Version: 0.1.0.dev20250605105049
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
konduktor_nightly-0.1.0.dev20250605105049.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=ChNdvAf2A-WmTHORSS31GGfP_dFREGg45VqQjZov4bo,1540
1
+ konduktor/__init__.py,sha256=6W1DQ2KPTCksSVesiBXKLOt4MwKcbZL8C1ot4Ov3dYg,1540
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -7,9 +7,9 @@ konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4
7
7
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
8
8
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
9
9
  konduktor/backends/jobset.py,sha256=UdhwAuZODLMbLY51Y2zOBsh6wg4Pb84oHVvUKzx3Z2w,8434
10
- konduktor/backends/jobset_utils.py,sha256=diGpy-qpsQeVMFZVQsMgG3HxJxl2huxqbRU6FMA0QHY,21363
10
+ konduktor/backends/jobset_utils.py,sha256=DSdWdCUPdTh9EzFG0z_GoHVeCW49kLwTujaKjC1ko3I,21430
11
11
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
12
- konduktor/cli.py,sha256=qiTFut28crvcXOoSvnN3NcTb-xPmtFB336zdt9Q2bxU,33370
12
+ konduktor/cli.py,sha256=4eYDqSvwEn38rDNk6fKiSk4BwdmSna2XonUrI9-o7w0,33903
13
13
  konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
14
14
  konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
15
15
  konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -68,14 +68,14 @@ konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4
68
68
  konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
69
69
  konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
70
70
  konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
71
- konduktor/resource.py,sha256=w2PdIrmQaJWA-GLSmVBcg4lxwuxvPulz35_YSKa5o24,19254
71
+ konduktor/resource.py,sha256=Fg4kon7jQ9xDo9Iz8Q0J8doIRmTkSwIhYXLH6jbtRO8,19610
72
72
  konduktor/task.py,sha256=ofwd8WIhfD6C3ThLcv6X3GUzQHyZ6ddjUagE-umF4K0,35207
73
- konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
74
- konduktor/templates/pod.yaml.j2,sha256=JvDruGpBbRHSklqNEeKSvPH0Y1uldvcylqLUaIcguuQ,16086
73
+ konduktor/templates/jobset.yaml.j2,sha256=rdURknodtgLp4zoA2PX86Nn4wPpi3tr5l4IG55aWBRg,1059
74
+ konduktor/templates/pod.yaml.j2,sha256=7KyUy4orakJ8nI7ee8AIGGY4HHfhs-6zwUlAzhekZHw,16112
75
75
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
76
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
77
77
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
- konduktor/utils/accelerator_registry.py,sha256=1tpVIaM0UZ3w1desPhVEwNCUruamhP-igDZrcfaoRWI,574
78
+ konduktor/utils/accelerator_registry.py,sha256=LmhLPR-N9kxrk0UBYYcF1O6ADv9HHwsn_Pc3SOYFNzs,553
79
79
  konduktor/utils/annotations.py,sha256=oy2-BLydkFt3KWkXDuaGY84d6b7iISuy4eAT9uXk0Fc,2225
80
80
  konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMugyI,3130
81
81
  konduktor/utils/common_utils.py,sha256=4yG5Kjvu1hu6x2nKNaaCUKQNrheUaG61Qe913MFPry8,15060
@@ -87,12 +87,12 @@ konduktor/utils/kubernetes_utils.py,sha256=VG7qatUFyWHY-PCQ8fYWh2kn2TMwfg84cn-Vk
87
87
  konduktor/utils/log_utils.py,sha256=oFCKkYKCS_e_GRw_-0F7WsiIZNqJL1RZ4cD5-zh59Q4,9765
88
88
  konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
89
89
  konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
90
- konduktor/utils/schemas.py,sha256=2fHsTi3t9q3LXqOPrcpkmPsMbaoJBnuJstd6ULmDiUo,16455
90
+ konduktor/utils/schemas.py,sha256=VGPERAso2G4sVAznsJ80qT2Q-I_EFxXw6Rfcw-vkYgQ,16535
91
91
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
92
92
  konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
93
93
  konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
94
- konduktor_nightly-0.1.0.dev20250603105033.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
- konduktor_nightly-0.1.0.dev20250603105033.dist-info/METADATA,sha256=oSHm7k_dhVrpXPt8R0bhBnRfGjMoyBep9nkpBMqP7Qo,4289
96
- konduktor_nightly-0.1.0.dev20250603105033.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
- konduktor_nightly-0.1.0.dev20250603105033.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
- konduktor_nightly-0.1.0.dev20250603105033.dist-info/RECORD,,
94
+ konduktor_nightly-0.1.0.dev20250605105049.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
+ konduktor_nightly-0.1.0.dev20250605105049.dist-info/METADATA,sha256=uZGkLvGqATI96wS6l8jrBSQo7B9x2VPBwaZw-PhDWa0,4289
96
+ konduktor_nightly-0.1.0.dev20250605105049.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
+ konduktor_nightly-0.1.0.dev20250605105049.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
+ konduktor_nightly-0.1.0.dev20250605105049.dist-info/RECORD,,