konduktor-nightly 0.1.0.dev20250821104804__py3-none-any.whl → 0.1.0.dev20250822104721__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

konduktor/__init__.py CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = 'eee38d922bf4c7cb8a2e6e730092dde0ae372500'
14
+ _KONDUKTOR_COMMIT_SHA = 'face26aca22b99192e740dca4875261fcffa2a55'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250821104804'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250822104721'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -47,8 +47,6 @@ _DEPLOYMENT_METADATA_LABELS = {
47
47
  'model_name_label': AIBRIX_NAME_LABEL,
48
48
  }
49
49
 
50
- _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
51
-
52
50
 
53
51
  def render_specs(
54
52
  task: 'konduktor.Task',
@@ -781,7 +779,12 @@ def show_status_table(namespace: str, all_users: bool):
781
779
  )
782
780
  if port_obj and port_obj.port:
783
781
  port_str = str(port_obj.port)
784
- endpoint_str = f'{ip_str}:{port_str}' if port_str else ip_str
782
+
783
+ # For vLLM deployments, don't append port since external routing is on port 80
784
+ if AIBRIX_NAME_LABEL in labels:
785
+ endpoint_str = ip_str
786
+ else:
787
+ endpoint_str = f'{ip_str}:{port_str}' if port_str else ip_str
785
788
 
786
789
  # Replicas
787
790
  ready_replicas = (
@@ -329,15 +329,6 @@ def inject_deployment_pod_metadata(
329
329
  if task.resources and task.resources.labels:
330
330
  pod_spec['metadata']['labels'].update(task.resources.labels)
331
331
 
332
- # Add max run duration annotation
333
- assert task.resources is not None and task.resources.labels is not None
334
- maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
335
- if not maxRunDurationSeconds:
336
- raise ValueError('maxRunDurationSeconds is required')
337
- pod_spec['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = str(
338
- maxRunDurationSeconds
339
- )
340
-
341
332
  # Set restart policy for deployments
342
333
  pod_spec.setdefault('spec', {})
343
334
  pod_spec['spec']['restartPolicy'] = 'Always'
konduktor/cli.py CHANGED
@@ -104,6 +104,7 @@ def _make_task_with_overrides(
104
104
  instance_type: Optional[str] = None,
105
105
  num_nodes: Optional[int] = None,
106
106
  max_restarts: Optional[int] = None,
107
+ completions: Optional[int] = None,
107
108
  image_id: Optional[str] = None,
108
109
  disk_size: Optional[int] = None,
109
110
  env: Optional[List[Tuple[str, str]]] = None,
@@ -166,7 +167,9 @@ def _make_task_with_overrides(
166
167
  if max_restarts is not None:
167
168
  assert task.resources is not None
168
169
  task.resources.job_config['max_restarts'] = max_restarts
169
-
170
+ if completions is not None:
171
+ assert task.resources is not None
172
+ task.resources.job_config['completions'] = completions
170
173
  if num_nodes is not None:
171
174
  task.num_nodes = num_nodes
172
175
  if name is not None:
@@ -215,6 +218,16 @@ _TASK_OPTIONS = [
215
218
  'supplied.'
216
219
  ),
217
220
  ),
221
+ click.option(
222
+ '--completions',
223
+ required=False,
224
+ type=int,
225
+ help=(
226
+ 'Number of successful completions required. Overrides YAML.'
227
+ 'Overrides the "completions" config in the YAML if both are '
228
+ 'supplied.'
229
+ ),
230
+ ),
218
231
  click.option(
219
232
  '--cpus',
220
233
  default=None,
@@ -798,6 +811,7 @@ def launch(
798
811
  memory: Optional[str],
799
812
  num_nodes: Optional[int],
800
813
  max_restarts: Optional[int],
814
+ completions: Optional[int],
801
815
  image_id: Optional[str],
802
816
  env_file: Optional[Dict[str, str]],
803
817
  env: List[Tuple[str, str]],
@@ -822,6 +836,7 @@ def launch(
822
836
  memory=memory,
823
837
  num_nodes=num_nodes,
824
838
  max_restarts=max_restarts,
839
+ completions=completions,
825
840
  image_id=image_id,
826
841
  env=env,
827
842
  disk_size=disk_size,
@@ -1686,6 +1701,7 @@ def serve_launch(
1686
1701
  memory: Optional[str],
1687
1702
  num_nodes: Optional[int],
1688
1703
  max_restarts: Optional[int],
1704
+ completions: Optional[int],
1689
1705
  image_id: Optional[str],
1690
1706
  env_file: Optional[Dict[str, str]],
1691
1707
  env: List[Tuple[str, str]],
@@ -1714,6 +1730,7 @@ def serve_launch(
1714
1730
  memory=memory,
1715
1731
  num_nodes=num_nodes,
1716
1732
  max_restarts=max_restarts,
1733
+ completions=completions,
1717
1734
  image_id=image_id,
1718
1735
  env=env,
1719
1736
  disk_size=disk_size,
konduktor/resource.py CHANGED
@@ -387,7 +387,13 @@ class Resources:
387
387
 
388
388
  def get_completions(self) -> Optional[int]:
389
389
  value = self.job_config.get('completions')
390
- return int(value) if value is not None else None
390
+ if value is not None:
391
+ value = int(value)
392
+ if value <= 0:
393
+ with ux_utils.print_exception_no_traceback():
394
+ raise ValueError('completions must be a positive integer')
395
+ return value
396
+ return None
391
397
 
392
398
  def get_max_restarts(self) -> Optional[int]:
393
399
  value = self.job_config.get('max_restarts')
@@ -19,7 +19,7 @@ jobset:
19
19
  annotations: {}
20
20
  spec:
21
21
  ttlSecondsAfterFinished: 31536000 # 1 year (365 days)
22
- {% if max_restarts %}
22
+ {% if max_restarts is not none %}
23
23
  failurePolicy:
24
24
  maxRestarts: {{ max_restarts }}
25
25
  {% endif %}
@@ -29,7 +29,11 @@ jobset:
29
29
  spec:
30
30
  ttlSecondsAfterFinished: 600 # 5 minutes
31
31
  parallelism: {{ num_nodes }}
32
+ {% if completions %}
33
+ completions: {{ completions }}
34
+ {% else %}
32
35
  completions: {{ num_nodes }}
36
+ {% endif %}
33
37
  backoffLimit: 0
34
38
  template: {}
35
39
  podFailurePolicy:
@@ -467,7 +467,8 @@ def get_job_schema():
467
467
  'additionalProperties': False,
468
468
  'properties': {
469
469
  'completions': {
470
- 'type': 'number',
470
+ 'type': 'integer',
471
+ 'minimum': 1,
471
472
  },
472
473
  'max_restarts': {
473
474
  'type': 'integer',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250821104804
3
+ Version: 0.1.0.dev20250822104721
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=vSTzrg0X_r9sPmZDztRtaQbXDhuOpoRTQWuv4z1wybA,1574
1
+ konduktor/__init__.py,sha256=cRGSzBwIktM510ZBH1rt7DLOACBZjXpolNWjtbpVgR4,1574
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -8,12 +8,12 @@ konduktor/backends/__init__.py,sha256=usWJ8HdZJEyg7MIsN8Zcz9rk9e2Lq5dWJ8dv6hCN3y
8
8
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
9
9
  konduktor/backends/constants.py,sha256=nt9G9AmFCOMwO4GuKgRQSzJJuKapOmaROp4_Y0tMF5A,732
10
10
  konduktor/backends/deployment.py,sha256=EHfB2uLeKFQ3maek9tx6XL4_sjQ-ax59DZA79Q3EkVs,5519
11
- konduktor/backends/deployment_utils.py,sha256=VGuL01rKe7p7PoVRI_cP4tiZRxHZ13nnTMG-bmDf7P0,28975
11
+ konduktor/backends/deployment_utils.py,sha256=Z58o9I3XDxqQpssZ3j-31n0sRRYo64TV-KNkX44Admc,29084
12
12
  konduktor/backends/jobset.py,sha256=OwgDog9nH-FoUmNU_H--C3U5jx70reTKL1l849M1k5A,8430
13
13
  konduktor/backends/jobset_utils.py,sha256=8YUIFg7mi33A5p_K9GhVdjllTdXRMj_GEg-iQ5Nj_iU,25708
14
- konduktor/backends/pod_utils.py,sha256=EkoyN4mMfnrwEG656J1kxPtNZQClo4X7JEyzV1F8oHg,15294
14
+ konduktor/backends/pod_utils.py,sha256=KP_PAgsdNHFgt4Od-5gAtpifAKIL7DMBg7NJ44uqikg,14885
15
15
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
16
- konduktor/cli.py,sha256=YW9vNlbayDIcHk5D8-qsVGwoOdRV1o9kFmF2jtBf3vE,56641
16
+ konduktor/cli.py,sha256=zdcdqS0avYzxTGDuE040vt4s2IgC23uG3aVRmOK3YOY,57235
17
17
  konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
18
18
  konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
19
19
  konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,11 +72,11 @@ konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4
72
72
  konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
73
73
  konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
74
74
  konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
75
- konduktor/resource.py,sha256=qQhMlI6gvTaoGfYb9NNgSrUavgNqfcYVfb9V_oC5pLE,20411
75
+ konduktor/resource.py,sha256=epOkJngNGtSBvMqiQeO8j0C7g0PzYhwjTMbG9VapvyQ,20628
76
76
  konduktor/serving.py,sha256=sh8TPAUXg23Bkt0ByatIMdxFFqzRm18HJTEkt3wHzdo,5147
77
77
  konduktor/task.py,sha256=97iLCo62qpN9wLGNPeFw64E8k1nch7AyySY3BUXHPWY,37496
78
78
  konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
79
- konduktor/templates/jobset.yaml.j2,sha256=gURWl6uQv_OLni-LFy2E7ttjGOtuRDt5Vfs4ALH7fpI,1196
79
+ konduktor/templates/jobset.yaml.j2,sha256=NQcVeRNsTLLmTnJRnkL1vr45mSeth-b11YShXn_RoSg,1323
80
80
  konduktor/templates/pod.yaml.j2,sha256=owid7Wpo-kmQ-Gvx1zVoyG_feOiKviGDsnWLTlReXPo,19086
81
81
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
82
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
@@ -93,12 +93,12 @@ konduktor/utils/kubernetes_utils.py,sha256=7RThCOiyaALRqbwHZ40qMnBsbAgt669k0NHkx
93
93
  konduktor/utils/log_utils.py,sha256=k43eGpSwIdGhNotC8w7_Hq-bbyQQTrHwMOAMct_gr9M,16978
94
94
  konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
95
95
  konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
96
- konduktor/utils/schemas.py,sha256=tBrKhnkfn9uKDYdlb4L2KgooW-muuhww7U8fu9zX-ms,18336
96
+ konduktor/utils/schemas.py,sha256=M4O2uISyIYYFCrdQ_b2RKQNFoo81YG-GQdmOLOiWU14,18367
97
97
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
98
98
  konduktor/utils/ux_utils.py,sha256=7-Lt3QbDVvBQUli5_U9lOdXKeC-ip8rZBpO9gQ6vPJw,7955
99
99
  konduktor/utils/validator.py,sha256=5C1kE57Eyj1OPnAbvojqMNHHtf5fnl47FK_vEttd8aw,4331
100
- konduktor_nightly-0.1.0.dev20250821104804.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
- konduktor_nightly-0.1.0.dev20250821104804.dist-info/METADATA,sha256=qQPp3F_j37Mx4Ios1of0b_961SL9jkOJd-5sZ-_mFwE,4247
102
- konduktor_nightly-0.1.0.dev20250821104804.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
- konduktor_nightly-0.1.0.dev20250821104804.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
- konduktor_nightly-0.1.0.dev20250821104804.dist-info/RECORD,,
100
+ konduktor_nightly-0.1.0.dev20250822104721.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
+ konduktor_nightly-0.1.0.dev20250822104721.dist-info/METADATA,sha256=VG5_2yweBen0IWkOhw4PQWmuSbQ9-aYQ33IsxwyKRDE,4247
102
+ konduktor_nightly-0.1.0.dev20250822104721.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
+ konduktor_nightly-0.1.0.dev20250822104721.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
+ konduktor_nightly-0.1.0.dev20250822104721.dist-info/RECORD,,