konduktor-nightly 0.1.0.dev20251124105105__tar.gz → 0.1.0.dev20251211105235__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/PKG-INFO +2 -1
  2. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/pod_utils.py +14 -11
  4. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/cli.py +148 -185
  5. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/pyproject.toml +2 -1
  6. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/LICENSE +0 -0
  7. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/README.md +0 -0
  8. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/__init__.py +0 -0
  9. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/aws.py +0 -0
  10. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/common.py +0 -0
  11. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/adaptors/gcp.py +0 -0
  12. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/authentication.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/__init__.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/backend.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/constants.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/deployment.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/deployment_utils.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/jobset.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/backends/jobset_utils.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/check.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/config.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/constants.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/__init__.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/constants.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/launch.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/node.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/controller/parse.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/README.md +0 -0
  29. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/backend/main.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/backend/sockets.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  32. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/.gitignore +0 -0
  33. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  34. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  35. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  36. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  37. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  38. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  39. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  43. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  48. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/globals.css +0 -0
  49. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  50. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/layout.js +0 -0
  51. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  52. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/app/page.js +0 -0
  53. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  54. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  55. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/package-lock.json +0 -0
  56. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/package.json +0 -0
  57. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  58. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/server.js +0 -0
  59. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  60. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/__init__.py +0 -0
  61. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/aws/__init__.py +0 -0
  62. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/aws/s3.py +0 -0
  63. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/constants.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/data_utils.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/__init__.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/constants.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/gcs.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/gcp/utils.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/registry.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/storage.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/data/storage_utils.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/execution.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/kube_client.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/logging.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/aibrix-setup.yaml +0 -0
  76. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/apoxy-setup.yaml +0 -0
  77. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/apoxy-setup2.yaml +0 -0
  78. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/controller_deployment.yaml +0 -0
  79. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  80. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/resource.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/serving.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/task.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/deployment.yaml.j2 +0 -0
  86. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/jobset.yaml.j2 +0 -0
  87. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/templates/pod.yaml.j2 +0 -0
  88. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/usage/__init__.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/usage/constants.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/__init__.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/accelerator_registry.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/annotations.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/base64_utils.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/common_utils.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/constants.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/env_options.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/exceptions.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/kubernetes_enums.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/kubernetes_utils.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/log_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/loki_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/rich_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/schemas.py +0 -0
  104. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/subprocess_utils.py +0 -0
  105. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/ux_utils.py +0 -0
  106. {konduktor_nightly-0.1.0.dev20251124105105 → konduktor_nightly-0.1.0.dev20251211105235}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20251124105105
3
+ Version: 0.1.0.dev20251211105235
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -29,6 +29,7 @@ Requires-Dist: prettytable (>=3.12.0,<4.0.0)
29
29
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
30
30
  Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
31
31
  Requires-Dist: rich (>=13.9.4,<14.0.0)
32
+ Requires-Dist: sniffio (>=1.3,<2.0)
32
33
  Requires-Dist: websockets (>=15.0.1,<16.0.0)
33
34
  Description-Content-Type: text/markdown
34
35
 
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '4837817a46660c45c449eec1cf69ac90ba5b8390'
14
+ _KONDUKTOR_COMMIT_SHA = '421390595e3a1b9f263e790323deae61d94da231'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20251124105105'
48
+ __version__ = '1.0.0.dev0.1.0.dev20251211105235'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -28,6 +28,8 @@ if typing.TYPE_CHECKING:
28
28
  logger = logging.get_logger(__name__)
29
29
 
30
30
  _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
31
+ # Use a large default (7 days) to mimic "infinite" runtime.
32
+ _DEFAULT_MAX_RUN_DURATION_SECONDS = 604800
31
33
 
32
34
 
33
35
  def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
@@ -471,20 +473,21 @@ def inject_jobset_metadata(jobset_spec: Dict[str, Any], task: 'konduktor.Task')
471
473
  jobset_spec: The JobSet spec dictionary to modify
472
474
  task: The task object containing resource information
473
475
  """
474
- # Add max run duration annotation
475
- assert (
476
- task.resources is not None and task.resources.labels is not None
477
- ), 'Task resources and task.resources.labels are required'
478
- maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
479
- if not maxRunDurationSeconds:
480
- raise ValueError('maxRunDurationSeconds is required')
481
- jobset_spec['jobset']['metadata']['annotations'][_RUN_DURATION_ANNOTATION_KEY] = (
482
- str(maxRunDurationSeconds)
476
+ assert task.resources is not None, 'Task resources are required'
477
+ labels = task.resources.labels or {}
478
+
479
+ # Add max run duration annotation, defaulting to a practically infinite value.
480
+ maxRunDurationSeconds = labels.get('maxRunDurationSeconds')
481
+ metadata = jobset_spec['jobset']['metadata']
482
+ metadata.setdefault('annotations', {})[_RUN_DURATION_ANNOTATION_KEY] = str(
483
+ maxRunDurationSeconds
484
+ if maxRunDurationSeconds is not None
485
+ else _DEFAULT_MAX_RUN_DURATION_SECONDS
483
486
  )
484
487
 
485
488
  # Inject resource labels into JobSet metadata.
486
- if task.resources and task.resources.labels:
487
- jobset_spec['jobset']['metadata']['labels'].update(task.resources.labels)
489
+ if labels:
490
+ jobset_spec['jobset']['metadata']['labels'].update(labels)
488
491
 
489
492
 
490
493
  def merge_pod_into_jobset_template(
@@ -274,22 +274,20 @@ _TASK_OPTIONS = [
274
274
  '--env-file',
275
275
  required=False,
276
276
  type=dotenv.dotenv_values,
277
- help="""\
278
- Path to a dotenv file with environment variables to set on the remote
279
- node.
280
-
281
- If any values from ``--env-file`` conflict with values set by
282
- ``--env``, the ``--env`` value will be preferred.""",
277
+ help=(
278
+ 'Path to a dotenv file with environment variables to set on the '
279
+ 'remote node. If any values from ``--env-file`` conflict '
280
+ 'with values set by ``--env``, the ``--env`` value will '
281
+ 'be preferred.'
282
+ ),
283
283
  ),
284
284
  click.option(
285
285
  '--env',
286
286
  required=False,
287
287
  type=_parse_env_var,
288
288
  multiple=True,
289
- help="""\
290
- Environment variable to set on the remote node.
291
- It can be specified multiple times.
292
- Examples:
289
+ help="""\\
290
+ Environment variable to set on the remote node. It can be specified multiple times:
293
291
 
294
292
  \b
295
293
  1. ``--env MY_ENV=1``: set ``$MY_ENV`` on the cluster to be 1.
@@ -299,7 +297,7 @@ _TASK_OPTIONS = [
299
297
  is run.
300
298
 
301
299
  3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the
302
- same value of ``$MY_ENV3`` in the local environment.""",
300
+ same value of ``$MY_ENV3`` in the local environment.""", # noqa: E501,
303
301
  ),
304
302
  ]
305
303
  _TASK_OPTIONS_WITH_NAME = [
@@ -321,10 +319,10 @@ _EXTRA_RESOURCES_OPTIONS = [
321
319
  type=str,
322
320
  help=(
323
321
  'Type and number of GPUs to use. Example values: '
324
- '"V100:8", "V100" (short for a count of 1)'
322
+ '"V100:8", "V100" (short for a count of 1) '
325
323
  'If a new cluster is being launched by this command, this is the '
326
- 'resources to provision. If an existing cluster is being reused, this'
327
- " is seen as the task demand, which must fit the cluster's total "
324
+ 'resources to provision. If an existing cluster is being reused, this '
325
+ "is seen as the task demand, which must fit the cluster's total "
328
326
  'resources and is used for scheduling the task. '
329
327
  'Overrides the "accelerators" '
330
328
  'config in the YAML if both are supplied. '
@@ -661,19 +659,19 @@ def status(
661
659
 
662
660
  \b
663
661
  Examples:
664
- konduktor status --limit 10
665
- konduktor status --before "08/06/25 03:53PM"
666
- konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
662
+ konduktor status --limit 10
663
+ konduktor status --before "08/06/25 03:53PM"
664
+ konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
667
665
 
668
666
  \b
669
667
  Notes:
670
- • When using --before or --after timestamps, "08/06/25"
671
- is equivalent to "08/06/25 00:00".
672
- • "03:53PM" is equivalent to "03:53:00PM".
673
- • Timestamps shown in "konduktor status" are truncated
674
- and are in the local timezone.
675
- Example: "03:53:55PM" → "03:53PM" would show up in
676
- --after "03:53PM" but not in --before "03:53PM".
668
+ • When using --before or --after timestamps, "08/06/25" is
669
+ equivalent to "08/06/25 00:00".
670
+ • "03:53PM" is equivalent to "03:53:00PM".
671
+ • Timestamps shown in "konduktor status" are truncated and are in
672
+ the local timezone.
673
+ Example: "03:53:55PM" → "03:53PM" would show up in --after "03:53PM"
674
+ but not in --before "03:53PM".
677
675
  """
678
676
  context = kubernetes_utils.get_current_kube_config_context_name()
679
677
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -691,8 +689,8 @@ def status(
691
689
  is_flag=True,
692
690
  default=False,
693
691
  help=(
694
- 'If specified, do not show logs but exit with a status code for the '
695
- "job's status: 0 for succeeded, or 1 for all other statuses."
692
+ '[DEPRECATED] If specified, do not show logs but exit with a status code '
693
+ "for the job's status: 0 for succeeded, or 1 for all other statuses."
696
694
  ),
697
695
  )
698
696
  @click.option(
@@ -702,7 +700,7 @@ def status(
702
700
  help=(
703
701
  'Follow the logs of a job. '
704
702
  'If --no-follow is specified, print the log so far and exit. '
705
- '[default: --follow]'
703
+ '(default: --follow)'
706
704
  ),
707
705
  )
708
706
  @click.option(
@@ -732,7 +730,7 @@ def status(
732
730
  default='1h',
733
731
  help=(
734
732
  'Choose how much time from now to look back in logs. '
735
- 'Examples: 30s, 5m, 2h, 1d. Default is 1h.'
733
+ 'Examples: 30s, 5m, 2h, 1d. Default is 1h. '
736
734
  'Note: currently only applies when streaming (default --follow). '
737
735
  'With --no-follow, all available logs are returned.'
738
736
  ),
@@ -866,8 +864,10 @@ def launch(
866
864
  ):
867
865
  """Launch a task.
868
866
 
869
- If ENTRYPOINT points to a valid YAML file, it is read in as the task
870
- specification. Otherwise, it is interpreted as a bash command.
867
+ \b
868
+ Notes:
869
+ • If ENTRYPOINT points to a valid YAML file, it is read in as the task
870
+ specification. Otherwise, it is interpreted as a bash command.
871
871
  """
872
872
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
873
873
  env = _merge_env_vars(env_file, env)
@@ -944,6 +944,7 @@ def _find_matching_jobs(
944
944
  jobs_response: Dict[str, Any],
945
945
  namespace: str,
946
946
  all_users: Optional[bool],
947
+ all_flag: Optional[bool] = None,
947
948
  ):
948
949
  """
949
950
  Find all jobs matching against the user specified pattern.
@@ -956,19 +957,21 @@ def _find_matching_jobs(
956
957
 
957
958
  jobs_specs = [job for job in jobs_response['items']]
958
959
 
959
- if all_users:
960
+ if all_flag:
960
961
  assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
961
962
  assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
962
- jobs = [
963
- job['metadata']['name']
964
- for job in jobs_specs
965
- if job['metadata']['labels'][backend_constants.USER_LABEL]
966
- == common_utils.get_cleaned_username()
967
- ]
968
- logger.debug(
969
- f'Jobs found for user {colorama.Style.BRIGHT}{colorama.Fore.CYAN}'
970
- f'{common_utils.get_cleaned_username()}{colorama.Style.RESET_ALL}: {jobs}'
971
- )
963
+ if all_users:
964
+ # --all with --all-users = all jobs of all users
965
+ jobs = [job['metadata']['name'] for job in jobs_specs]
966
+ else:
967
+ # --all without --all-users = all jobs of current user
968
+ jobs = [
969
+ job['metadata']['name']
970
+ for job in jobs_specs
971
+ if job['metadata']['labels'][backend_constants.USER_LABEL]
972
+ == common_utils.get_cleaned_username()
973
+ ]
974
+ return jobs
972
975
  elif jobs:
973
976
  # Get all available jobs to match against patterns
974
977
  if len(jobs_specs) == 0:
@@ -1070,36 +1073,33 @@ def down(
1070
1073
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1071
1074
  """Tear down job(s).
1072
1075
 
1073
- JOB is the name of the job to tear down. If both
1074
- JOB and ``--all`` are supplied, the latter takes precedence.
1075
-
1076
- Tearing down a job will delete all associated containers (all billing
1077
- stops), and any data on the containers disks will be lost. Accelerators
1078
- (e.g., GPUs) that are part of the job will be deleted too.
1079
-
1080
- Wildcard patterns are supported using * characters.
1081
- Examples: "test-*" matches all jobs starting with "test-",
1082
- "*-gpu" matches all jobs ending with "-gpu".
1083
-
1076
+ \b
1084
1077
  Examples:
1078
+ # Tear down a specific job.
1079
+ konduktor down my_job
1080
+ \b
1081
+ # Tear down multiple jobs.
1082
+ konduktor down my_job1 my_job2
1083
+ \b
1084
+ # Tear down all jobs matching a pattern.
1085
+ konduktor down "my_job-*"
1086
+ \b
1087
+ # Tear down all of this users jobs.
1088
+ konduktor down -a
1089
+ konduktor down --all
1090
+ \b
1091
+ # Tear down all jobs across all users
1092
+ konduktor down --all --all-users
1085
1093
 
1086
- .. code-block:: bash
1087
-
1088
- # Tear down a specific job.
1089
- konduktor down cluster_name
1090
- \b
1091
- # Tear down multiple jobs.
1092
- konduktor down job1 job2
1093
- \b
1094
- # Tear down all jobs matching a pattern.
1095
- konduktor down "test-*"
1096
- \b
1097
- # Tear down all of this users jobs.
1098
- konduktor down -a
1099
- konduktor down --all
1100
-
1101
- # Tear down all jobs across all users
1102
- konduktor down --all --all-users
1094
+ \b
1095
+ Notes:
1096
+ If both JOB and ``--all`` are supplied, the latter takes precedence.
1097
+ Tearing down a job will delete all associated containers (all billing
1098
+ stops), and any data on the containers disks will be lost. Accelerators
1099
+ (e.g., GPUs) that are part of the job will be deleted too.
1100
+ Wildcard patterns are supported using * characters.
1101
+ Ex: "test-*" matches all jobs starting with "test-",
1102
+ "*-gpu" matches all jobs ending with "-gpu".
1103
1103
 
1104
1104
  """
1105
1105
 
@@ -1107,7 +1107,7 @@ def down(
1107
1107
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
1108
1108
  jobs_response = jobset_utils.list_jobset(namespace)
1109
1109
  assert jobs_response
1110
- filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users)
1110
+ filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users, all)
1111
1111
 
1112
1112
  if not yes:
1113
1113
  # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
@@ -1154,46 +1154,41 @@ def stop(
1154
1154
  ):
1155
1155
  """Suspend job(s) (manual/user-initiated).
1156
1156
 
1157
- JOB is the name of the job to suspend. If both
1158
- JOB and ``--all`` are supplied, the latter takes precedence.
1159
-
1160
- Suspending a job will pause execution and mark the job as SUSPENDED (by user).
1161
- The job can be resumed later with `konduktor start`.
1162
-
1163
- If a job is suspended by the system (e.g., due to queueing),
1164
- it will show as SUSPENDED (by system).
1165
-
1166
- Wildcard patterns are supported using * characters.
1167
- Examples: "my_job-*" matches all jobs starting with "my_job-",
1168
- "*-gpu" matches all jobs ending with "-gpu".
1169
-
1157
+ \b
1170
1158
  Examples:
1159
+ # Suspend a specific job.
1160
+ konduktor stop my_job
1161
+ \b
1162
+ # Suspend multiple jobs.
1163
+ konduktor stop my_job1 my_job2
1164
+ \b
1165
+ # Suspend all jobs matching a pattern.
1166
+ konduktor stop "my_job-*"
1167
+ \b
1168
+ # Suspend all of this users jobs.
1169
+ konduktor stop -a
1170
+ konduktor stop --all
1171
+ \b
1172
+ # Suspend all jobs across all users
1173
+ konduktor stop --all --all-users
1171
1174
 
1172
- .. code-block:: bash
1173
-
1174
- # Suspend a specific job.
1175
- konduktor stop my_job
1176
- \b
1177
- # Suspend multiple jobs.
1178
- konduktor stop my_job1 my_job2
1179
- \b
1180
- # Suspend all jobs matching a pattern.
1181
- konduktor stop "my_job-*"
1182
- \b
1183
- # Suspend all of this users jobs.
1184
- konduktor stop -a
1185
- konduktor stop --all
1186
-
1187
- # Suspend all jobs across all users
1188
- konduktor stop --all --all-users
1189
-
1175
+ \b
1176
+ Notes:
1177
+ If both JOB and ``--all`` are supplied, the latter takes precedence.
1178
+ Suspending a job will pause execution and mark the job as SUSPENDED (by user).
1179
+ The job can be resumed later with `konduktor start`.
1180
+ If a job is suspended by the system (e.g., due to queueing), it
1181
+ will show as SUSPENDED (by system).
1182
+ • Wildcard patterns are supported using * characters.
1183
+ Ex: "test-*" matches all jobs starting with "test-",
1184
+ "*-gpu" matches all jobs ending with "-gpu".
1190
1185
  """
1191
1186
 
1192
1187
  context = kubernetes_utils.get_current_kube_config_context_name()
1193
1188
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
1194
1189
  jobs_response = jobset_utils.list_jobset(namespace)
1195
1190
  assert jobs_response
1196
- filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users)
1191
+ filtered_jobs = _find_matching_jobs(jobs, jobs_response, namespace, all_users, all)
1197
1192
 
1198
1193
  if not yes:
1199
1194
  # Prompt for confirmation
@@ -1249,39 +1244,34 @@ def start(
1249
1244
  ):
1250
1245
  """Resume suspended job(s) (manual/user-initiated).
1251
1246
 
1252
- JOB is the name of the job to resume. If both
1253
- JOB and ``--all`` are supplied, the latter takes precedence.
1254
-
1255
- Resuming a job will restart execution from where it was suspended.
1256
- Only suspended jobs can be resumed.
1257
-
1258
- This command works for both manually suspended jobs (SUSPENDED by user)
1259
- and system-suspended jobs (SUSPENDED by system).
1260
-
1261
- Wildcard patterns are supported using * characters.
1262
- Examples: "my_job-*" matches all jobs starting with "my_job-",
1263
- "*-gpu" matches all jobs ending with "-gpu".
1264
-
1247
+ \b
1265
1248
  Examples:
1249
+ # Resume a specific job.
1250
+ konduktor start my_job
1251
+ \b
1252
+ # Resume multiple jobs.
1253
+ konduktor start my_job1 my_job2
1254
+ \b
1255
+ # Resume all jobs matching a pattern.
1256
+ konduktor start "my_job-*"
1257
+ \b
1258
+ # Resume all of this users suspended jobs.
1259
+ konduktor start -a
1260
+ konduktor start --all
1261
+ \b
1262
+ # Resume all suspended jobs across all users
1263
+ konduktor start --all --all-users
1266
1264
 
1267
- .. code-block:: bash
1268
-
1269
- # Resume a specific job.
1270
- konduktor start my_job
1271
- \b
1272
- # Resume multiple jobs.
1273
- konduktor start my_job1 my_job2
1274
- \b
1275
- # Resume all jobs matching a pattern.
1276
- konduktor start "my_job-*"
1277
- \b
1278
- # Resume all of this users suspended jobs.
1279
- konduktor start -a
1280
- konduktor start --all
1281
-
1282
- # Resume all suspended jobs across all users
1283
- konduktor start --all --all-users
1284
-
1265
+ \b
1266
+ Notes:
1267
+ If both JOB and ``--all`` are supplied, the latter takes precedence.
1268
+ Resuming a job will restart execution from where it was suspended.
1269
+ Only suspended jobs can be resumed.
1270
+ This command works for both manually suspended jobs (SUSPENDED by user)
1271
+ and system-suspended jobs (SUSPENDED by system).
1272
+ • Wildcard patterns are supported using * characters.
1273
+ Ex: "test-*" matches all jobs starting with "test-",
1274
+ "*-gpu" matches all jobs ending with "-gpu".
1285
1275
  """
1286
1276
 
1287
1277
  context = kubernetes_utils.get_current_kube_config_context_name()
@@ -1364,24 +1354,22 @@ def start(
1364
1354
  nargs=-1,
1365
1355
  )
1366
1356
  def check(clouds: Tuple[str]):
1367
- """Check which clouds are available to use for storage
1368
-
1369
- This checks storage credentials for a cloud supported by konduktor. If a
1370
- cloud is detected to be inaccessible, the reason and correction steps will
1371
- be shown.
1372
-
1373
- If CLOUDS are specified, checks credentials for only those clouds.
1374
-
1375
- The enabled clouds are cached and form the "search space" to be considered
1376
- for each task.
1357
+ """Check which clouds are available to use for storage with Konduktor
1377
1358
 
1359
+ \b
1378
1360
  Examples:
1361
+ # Check only specific clouds - gs, s3.
1362
+ konduktor check gs
1363
+ konduktor check s3
1379
1364
 
1380
- .. code-block:: bash
1381
-
1382
- # Check only specific clouds - gs, s3.
1383
- konduktor check gs
1384
- konduktor check s3
1365
+ \b
1366
+ Notes:
1367
+ This checks storage credentials for a cloud supported by konduktor.
1368
+ If a cloud is detected to be inaccessible, the reason and correction
1369
+ steps will be shown.
1370
+ • If CLOUDS are specified, checks credentials for only those clouds.
1371
+ • The enabled clouds are cached and form the "search space" to
1372
+ be considered for each task.
1385
1373
  """
1386
1374
  clouds_arg = clouds if len(clouds) > 0 else None
1387
1375
  konduktor_check.check(clouds=clouds_arg)
@@ -1430,23 +1418,12 @@ def secret():
1430
1418
 
1431
1419
  USAGE: konduktor secret COMMAND
1432
1420
 
1433
- \b
1434
- Use one of the following COMMANDS:
1435
- create [FLAGS] [NAME]
1436
- delete [NAME]
1437
- list [FLAGS]
1438
-
1439
1421
  \b
1440
1422
  Examples:
1441
- konduktor secret create --kind git-ssh --from-file=~/.ssh/id_rsa my-ssh-name
1442
- konduktor secret create --kind env --inline FOO=bar my-env-name
1443
- konduktor delete my-ssh-name
1444
- konduktor secret list
1445
-
1446
- \b
1447
- For details on COMMAND ARGS:
1448
- konduktor secret create -h
1449
- konduktor secret list -h
1423
+ konduktor secret create --kind git-ssh --from-file ~/.ssh/id_rsa my-ssh-name
1424
+ konduktor secret create --kind env --inline FOO=bar my-env-name
1425
+ konduktor secret delete my-ssh-name
1426
+ konduktor secret list
1450
1427
  """
1451
1428
 
1452
1429
 
@@ -1637,8 +1614,7 @@ def delete(name):
1637
1614
  help='Show all secrets, including those not owned by the current user.',
1638
1615
  )
1639
1616
  def list_secrets(all_users: bool):
1640
- """List secrets in the namespace.
1641
- Defaults to only your secrets unless --all-users is set."""
1617
+ """List secrets in the namespace."""
1642
1618
 
1643
1619
  context = kubernetes_utils.get_current_kube_config_context_name()
1644
1620
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -1683,23 +1659,11 @@ def serve():
1683
1659
 
1684
1660
  USAGE: konduktor serve COMMAND
1685
1661
 
1686
- \b
1687
- Use one of the following COMMANDS:
1688
- launch
1689
- down
1690
- status
1691
-
1692
1662
  \b
1693
1663
  Examples:
1694
1664
  konduktor serve launch my-deployment
1695
1665
  konduktor serve down my-deployment
1696
1666
  konduktor serve status
1697
-
1698
- \b
1699
- For details on COMMAND ARGS:
1700
- konduktor serve launch -h
1701
- konduktor serve down -h
1702
- konduktor serve status -h
1703
1667
  """
1704
1668
  pass
1705
1669
 
@@ -1774,8 +1738,10 @@ def serve_launch(
1774
1738
  ):
1775
1739
  """Launch a deployment to serve.
1776
1740
 
1777
- If ENTRYPOINT points to a valid YAML file, it is read in as the task
1778
- specification. Otherwise, it is interpreted as a bash command.
1741
+ \b
1742
+ Notes:
1743
+ • If ENTRYPOINT points to a valid YAML file, it is read in as the task
1744
+ specification. Otherwise, it is interpreted as a bash command.
1779
1745
  """
1780
1746
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1781
1747
  env = _merge_env_vars(env_file, env)
@@ -1857,13 +1823,10 @@ def serve_down(
1857
1823
  ):
1858
1824
  """Tear down deployments (Deployment, Service, PodAutoscaler).
1859
1825
 
1860
- Use --all or -a to tear down all deployments.
1861
-
1862
- Examples:
1863
-
1864
1826
  \b
1865
- konduktor serve down my-deployment
1866
- konduktor serve down -a
1827
+ Examples:
1828
+ konduktor serve down my-deployment
1829
+ konduktor serve down -a
1867
1830
  """
1868
1831
  context = kubernetes_utils.get_current_kube_config_context_name()
1869
1832
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20251124105105"
3
+ version = "0.1.0.dev20251211105235"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}
@@ -28,6 +28,7 @@ filelock = "^3.18.0"
28
28
  boto3 = { version = "^1.34.84", optional = true, extras = ["s3"] }
29
29
  botocore = { version = "^1.34.84", optional = true, extras = ["s3"] }
30
30
  awscli = { version = "^1.32.84", optional = true, extras = ["s3"] }
31
+ sniffio = "^1.3"
31
32
 
32
33
  [tool.poetry.extras]
33
34
  s3 = ["boto3", "botocore", "awscli"]