konduktor-nightly 0.1.0.dev20250821104804__tar.gz → 0.1.0.dev20250823104438__tar.gz

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

Files changed (103)
  1. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/deployment_utils.py +6 -3
  4. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/pod_utils.py +0 -9
  5. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/cli.py +18 -1
  6. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/resource.py +7 -1
  7. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/templates/jobset.yaml.j2 +5 -1
  8. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/schemas.py +2 -1
  9. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/pyproject.toml +1 -1
  10. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/LICENSE +0 -0
  11. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/README.md +0 -0
  12. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/adaptors/__init__.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/adaptors/aws.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/adaptors/common.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/adaptors/gcp.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/authentication.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/__init__.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/backend.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/constants.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/deployment.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/jobset.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/backends/jobset_utils.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/check.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/config.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/constants.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/controller/__init__.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/controller/constants.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/controller/launch.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/controller/node.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/controller/parse.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/README.md +0 -0
  32. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/backend/main.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/backend/sockets.py +0 -0
  34. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  35. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/.gitignore +0 -0
  36. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  37. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  38. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  39. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  46. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  51. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/globals.css +0 -0
  52. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  53. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/layout.js +0 -0
  54. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/app/page.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  57. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  58. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/package-lock.json +0 -0
  59. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/package.json +0 -0
  60. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  61. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/server.js +0 -0
  62. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  63. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/__init__.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/aws/__init__.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/aws/s3.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/constants.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/data_utils.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/gcp/__init__.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/gcp/constants.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/gcp/gcs.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/gcp/utils.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/registry.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/storage.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/data/storage_utils.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/execution.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/kube_client.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/logging.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/manifests/controller_deployment.yaml +0 -0
  79. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  80. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/serving.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/task.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/templates/deployment.yaml.j2 +0 -0
  85. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/templates/pod.yaml.j2 +0 -0
  86. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/usage/__init__.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/usage/constants.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/__init__.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/accelerator_registry.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/annotations.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/base64_utils.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/common_utils.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/constants.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/env_options.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/exceptions.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/kubernetes_enums.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/kubernetes_utils.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/log_utils.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/loki_utils.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/rich_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/subprocess_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/ux_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20250821104804 → konduktor_nightly-0.1.0.dev20250823104438}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250821104804
3
+ Version: 0.1.0.dev20250823104438
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = 'eee38d922bf4c7cb8a2e6e730092dde0ae372500'
14
+ _KONDUKTOR_COMMIT_SHA = 'face26aca22b99192e740dca4875261fcffa2a55'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250821104804'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250823104438'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -47,8 +47,6 @@ _DEPLOYMENT_METADATA_LABELS = {
47
47
  'model_name_label': AIBRIX_NAME_LABEL,
48
48
  }
49
49
 
50
- _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
51
-
52
50
 
53
51
  def render_specs(
54
52
  task: 'konduktor.Task',
@@ -781,7 +779,12 @@ def show_status_table(namespace: str, all_users: bool):
781
779
  )
782
780
  if port_obj and port_obj.port:
783
781
  port_str = str(port_obj.port)
784
- endpoint_str = f'{ip_str}:{port_str}' if port_str else ip_str
782
+
783
+ # For vLLM deployments, don't append port since external routing is on port 80
784
+ if AIBRIX_NAME_LABEL in labels:
785
+ endpoint_str = ip_str
786
+ else:
787
+ endpoint_str = f'{ip_str}:{port_str}' if port_str else ip_str
785
788
 
786
789
  # Replicas
787
790
  ready_replicas = (
@@ -329,15 +329,6 @@ def inject_deployment_pod_metadata(
329
329
  if task.resources and task.resources.labels:
330
330
  pod_spec['metadata']['labels'].update(task.resources.labels)
331
331
 
332
- # Add max run duration annotation
333
- assert task.resources is not None and task.resources.labels is not None
334
- maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
335
- if not maxRunDurationSeconds:
336
- raise ValueError('maxRunDurationSeconds is required')
337
- pod_spec['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = str(
338
- maxRunDurationSeconds
339
- )
340
-
341
332
  # Set restart policy for deployments
342
333
  pod_spec.setdefault('spec', {})
343
334
  pod_spec['spec']['restartPolicy'] = 'Always'
@@ -104,6 +104,7 @@ def _make_task_with_overrides(
104
104
  instance_type: Optional[str] = None,
105
105
  num_nodes: Optional[int] = None,
106
106
  max_restarts: Optional[int] = None,
107
+ completions: Optional[int] = None,
107
108
  image_id: Optional[str] = None,
108
109
  disk_size: Optional[int] = None,
109
110
  env: Optional[List[Tuple[str, str]]] = None,
@@ -166,7 +167,9 @@ def _make_task_with_overrides(
166
167
  if max_restarts is not None:
167
168
  assert task.resources is not None
168
169
  task.resources.job_config['max_restarts'] = max_restarts
169
-
170
+ if completions is not None:
171
+ assert task.resources is not None
172
+ task.resources.job_config['completions'] = completions
170
173
  if num_nodes is not None:
171
174
  task.num_nodes = num_nodes
172
175
  if name is not None:
@@ -215,6 +218,16 @@ _TASK_OPTIONS = [
215
218
  'supplied.'
216
219
  ),
217
220
  ),
221
+ click.option(
222
+ '--completions',
223
+ required=False,
224
+ type=int,
225
+ help=(
226
+ 'Number of successful completions required. Overrides YAML.'
227
+ 'Overrides the "completions" config in the YAML if both are '
228
+ 'supplied.'
229
+ ),
230
+ ),
218
231
  click.option(
219
232
  '--cpus',
220
233
  default=None,
@@ -798,6 +811,7 @@ def launch(
798
811
  memory: Optional[str],
799
812
  num_nodes: Optional[int],
800
813
  max_restarts: Optional[int],
814
+ completions: Optional[int],
801
815
  image_id: Optional[str],
802
816
  env_file: Optional[Dict[str, str]],
803
817
  env: List[Tuple[str, str]],
@@ -822,6 +836,7 @@ def launch(
822
836
  memory=memory,
823
837
  num_nodes=num_nodes,
824
838
  max_restarts=max_restarts,
839
+ completions=completions,
825
840
  image_id=image_id,
826
841
  env=env,
827
842
  disk_size=disk_size,
@@ -1686,6 +1701,7 @@ def serve_launch(
1686
1701
  memory: Optional[str],
1687
1702
  num_nodes: Optional[int],
1688
1703
  max_restarts: Optional[int],
1704
+ completions: Optional[int],
1689
1705
  image_id: Optional[str],
1690
1706
  env_file: Optional[Dict[str, str]],
1691
1707
  env: List[Tuple[str, str]],
@@ -1714,6 +1730,7 @@ def serve_launch(
1714
1730
  memory=memory,
1715
1731
  num_nodes=num_nodes,
1716
1732
  max_restarts=max_restarts,
1733
+ completions=completions,
1717
1734
  image_id=image_id,
1718
1735
  env=env,
1719
1736
  disk_size=disk_size,
@@ -387,7 +387,13 @@ class Resources:
387
387
 
388
388
  def get_completions(self) -> Optional[int]:
389
389
  value = self.job_config.get('completions')
390
- return int(value) if value is not None else None
390
+ if value is not None:
391
+ value = int(value)
392
+ if value <= 0:
393
+ with ux_utils.print_exception_no_traceback():
394
+ raise ValueError('completions must be a positive integer')
395
+ return value
396
+ return None
391
397
 
392
398
  def get_max_restarts(self) -> Optional[int]:
393
399
  value = self.job_config.get('max_restarts')
@@ -19,7 +19,7 @@ jobset:
19
19
  annotations: {}
20
20
  spec:
21
21
  ttlSecondsAfterFinished: 31536000 # 1 year (365 days)
22
- {% if max_restarts %}
22
+ {% if max_restarts is not none %}
23
23
  failurePolicy:
24
24
  maxRestarts: {{ max_restarts }}
25
25
  {% endif %}
@@ -29,7 +29,11 @@ jobset:
29
29
  spec:
30
30
  ttlSecondsAfterFinished: 600 # 5 minutes
31
31
  parallelism: {{ num_nodes }}
32
+ {% if completions %}
33
+ completions: {{ completions }}
34
+ {% else %}
32
35
  completions: {{ num_nodes }}
36
+ {% endif %}
33
37
  backoffLimit: 0
34
38
  template: {}
35
39
  podFailurePolicy:
@@ -467,7 +467,8 @@ def get_job_schema():
467
467
  'additionalProperties': False,
468
468
  'properties': {
469
469
  'completions': {
470
- 'type': 'number',
470
+ 'type': 'integer',
471
+ 'minimum': 1,
471
472
  },
472
473
  'max_restarts': {
473
474
  'type': 'integer',
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250821104804"
3
+ version = "0.1.0.dev20250823104438"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}