konduktor-nightly 0.1.0.dev20250810104857__tar.gz → 0.1.0.dev20250812105102__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

Files changed (103)
  1. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/constants.py +1 -0
  4. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/jobset.py +3 -2
  5. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/jobset_utils.py +45 -24
  6. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/cli.py +25 -15
  7. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/logging.py +6 -4
  8. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/templates/jobset.yaml.j2 +3 -0
  9. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/log_utils.py +29 -22
  10. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/ux_utils.py +25 -11
  11. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/pyproject.toml +1 -1
  12. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/LICENSE +0 -0
  13. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/README.md +0 -0
  14. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/__init__.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/aws.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/common.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/adaptors/gcp.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/authentication.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/__init__.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/backend.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/deployment.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/deployment_utils.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/backends/pod_utils.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/check.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/config.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/constants.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/__init__.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/constants.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/launch.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/node.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/controller/parse.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/README.md +0 -0
  33. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/backend/main.py +0 -0
  34. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/backend/sockets.py +0 -0
  35. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  36. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/.gitignore +0 -0
  37. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  38. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  39. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  47. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  52. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/globals.css +0 -0
  53. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  54. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/layout.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/app/page.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  58. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  59. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/package-lock.json +0 -0
  60. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/package.json +0 -0
  61. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  62. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/server.js +0 -0
  63. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  64. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/__init__.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/aws/__init__.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/aws/s3.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/constants.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/data_utils.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/__init__.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/constants.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/gcs.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/gcp/utils.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/registry.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/storage.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/data/storage_utils.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/execution.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/kube_client.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/controller_deployment.yaml +0 -0
  79. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  80. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/resource.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/serving.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/task.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/templates/deployment.yaml.j2 +0 -0
  86. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/templates/pod.yaml.j2 +0 -0
  87. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/usage/__init__.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/usage/constants.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/__init__.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/accelerator_registry.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/annotations.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/base64_utils.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/common_utils.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/constants.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/env_options.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/exceptions.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/kubernetes_enums.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/kubernetes_utils.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/loki_utils.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/rich_utils.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/schemas.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/subprocess_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20250810104857 → konduktor_nightly-0.1.0.dev20250812105102}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250810104857
3
+ Version: 0.1.0.dev20250812105102
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '0f0b36c3a67aa7c60d6cb33240631b7c8ccaed03'
14
+ _KONDUKTOR_COMMIT_SHA = 'f4ba2084fac1c1030245b475323f4f3a57fd3fa3'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250810104857'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250812105102'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -8,6 +8,7 @@ USERID_LABEL = 'trainy.ai/user-id'
8
8
  USER_LABEL = 'trainy.ai/username'
9
9
  ACCELERATOR_LABEL = 'trainy.ai/accelerator'
10
10
  NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
11
+ MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'
11
12
 
12
13
  # Start/stop/status related labels
13
14
  STOP_USERID_LABEL = 'trainy.ai/stop-userid'
@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
176
176
  context = kubernetes_utils.get_current_kube_config_context_name()
177
177
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
178
178
  # TODO(asaiacai): need to set env variables in pod
179
- jobset_utils.create_jobset(
179
+ jobset_response = jobset_utils.create_jobset(
180
180
  namespace,
181
181
  task,
182
182
  pod_spec['kubernetes']['pod_config'],
@@ -192,9 +192,10 @@ class JobsetBackend(backend.Backend):
192
192
  ):
193
193
  _wait_for_jobset_start(namespace, task.name)
194
194
  try:
195
+ assert jobset_response is not None
195
196
  log_thread = threading.Thread(
196
197
  target=log_utils.tail_logs,
197
- args=(task.name,),
198
+ args=(jobset_response,),
198
199
  daemon=True,
199
200
  )
200
201
  logger.info('streaming logs...')
@@ -39,6 +39,7 @@ JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
39
39
  JOBSET_USER_LABEL = backend_constants.USER_LABEL
40
40
  JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
41
41
  JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
42
+ JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL
42
43
 
43
44
  SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
44
45
 
@@ -48,6 +49,7 @@ _JOBSET_METADATA_LABELS = {
48
49
  'jobset_user_label': JOBSET_USER_LABEL,
49
50
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
50
51
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
52
+ 'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
51
53
  }
52
54
 
53
55
 
@@ -79,6 +81,7 @@ def create_jobset(
79
81
  assert task.resources is not None, 'Task resources are undefined'
80
82
  accelerator_type = task.resources.get_accelerator_type() or 'None'
81
83
  num_accelerators = task.resources.get_accelerator_count() or 0
84
+ labels = task.resources.labels if task.resources.labels else {}
82
85
  with tempfile.NamedTemporaryFile() as temp:
83
86
  common_utils.fill_template(
84
87
  'jobset.yaml.j2',
@@ -91,6 +94,7 @@ def create_jobset(
91
94
  'num_accelerators': num_accelerators,
92
95
  'completions': task.resources.get_completions(),
93
96
  'max_restarts': task.resources.get_max_restarts(),
97
+ 'max_execution_time': labels.get('maxRunDurationSeconds', None),
94
98
  **_JOBSET_METADATA_LABELS,
95
99
  },
96
100
  temp.name,
@@ -232,8 +236,12 @@ def stop_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
232
236
  'spec': {'suspend': True},
233
237
  'metadata': {
234
238
  'annotations': {
235
- backend_constants.STOP_USERID_LABEL: common_utils.user_and_hostname_hash(),
236
- backend_constants.STOP_USERNAME_LABEL: common_utils.get_cleaned_username(),
239
+ backend_constants.STOP_USERID_LABEL: (
240
+ common_utils.user_and_hostname_hash()
241
+ ),
242
+ backend_constants.STOP_USERNAME_LABEL: (
243
+ common_utils.get_cleaned_username()
244
+ ),
237
245
  }
238
246
  },
239
247
  }
@@ -426,6 +434,36 @@ def _parse_timestamp_filter(timestamp_str: str) -> datetime:
426
434
  )
427
435
 
428
436
 
437
+ def _format_timestamp(timestamp: str) -> str:
438
+ """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
439
+ # Parse UTC timestamp and convert to local time
440
+ dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
441
+ tzinfo=timezone.utc
442
+ )
443
+ dt_local = dt_utc.astimezone() # Convert to local timezone
444
+ return dt_local.strftime('%m/%d/%y %I:%M%p')
445
+
446
+
447
+ def _get_job_start_time(job: Dict[str, Any]) -> str:
448
+ for condition in job['status']['conditions']:
449
+ if condition['reason'] == 'ResumeJobs':
450
+ return condition.get('lastTransitionTime', '')
451
+ return '-'
452
+
453
+
454
+ def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
455
+ """Extract end time from JobSet conditions (Completed or Failed)"""
456
+ conditions = job.get('status', {}).get('conditions', [])
457
+ for condition in conditions:
458
+ # Look for terminal conditions with status=True
459
+ if (
460
+ condition.get('type') in ['Completed', 'Failed']
461
+ and condition.get('status') == 'True'
462
+ ):
463
+ return condition.get('lastTransitionTime', '')
464
+ return '-'
465
+
466
+
429
467
  def show_status_table(
430
468
  namespace: str,
431
469
  all_users: bool,
@@ -519,15 +557,6 @@ def show_status_table(
519
557
  result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
520
558
  return result if result else '<1 minute', delta
521
559
 
522
- def _format_timestamp(timestamp: str) -> str:
523
- """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
524
- # Parse UTC timestamp and convert to local time
525
- dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
526
- tzinfo=timezone.utc
527
- )
528
- dt_local = dt_utc.astimezone() # Convert to local timezone
529
- return dt_local.strftime('%m/%d/%y %I:%M%p')
530
-
531
560
  def _get_resources(job: Dict[str, Any]) -> str:
532
561
  num_pods = int(
533
562
  job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
@@ -587,25 +616,17 @@ def show_status_table(
587
616
  if before_dt and job_creation_time >= before_dt:
588
617
  continue
589
618
  # Get start time
590
- start_time = _format_timestamp(job['metadata']['creationTimestamp'])
619
+ start_time = _get_job_start_time(job)
620
+ if start_time != '-':
621
+ start_time = _format_timestamp(start_time)
591
622
 
592
623
  # Get submitted time (how long ago)
593
624
  submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
594
625
 
595
626
  # Get end time (from JobSet conditions)
596
- def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
597
- """Extract end time from JobSet conditions (Completed or Failed)"""
598
- conditions = job.get('status', {}).get('conditions', [])
599
- for condition in conditions:
600
- # Look for terminal conditions with status=True
601
- if (
602
- condition.get('type') in ['Completed', 'Failed']
603
- and condition.get('status') == 'True'
604
- ):
605
- return _format_timestamp(condition.get('lastTransitionTime', ''))
606
- return '-'
607
-
608
627
  end_time = _get_end_time_from_conditions(job)
628
+ if end_time != '-':
629
+ end_time = _format_timestamp(end_time)
609
630
 
610
631
  if all_users:
611
632
  rows.append(
@@ -732,7 +732,7 @@ def logs(
732
732
  # Verify the job exists before attempting to tail logs
733
733
  # TODO(asaiacai): unify the 404 logic under jobset_utils
734
734
  try:
735
- jobset_utils.get_jobset(namespace, job_id)
735
+ jobset_response = jobset_utils.get_jobset(namespace, job_id)
736
736
  except jobset_utils.JobNotFoundError:
737
737
  raise click.UsageError(
738
738
  f"Job '{job_id}' not found in namespace "
@@ -741,12 +741,9 @@ def logs(
741
741
  f'{colorama.Style.RESET_ALL}.'
742
742
  )
743
743
 
744
- click.secho(
745
- 'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
746
- fg='yellow',
747
- )
744
+ assert isinstance(jobset_response, dict), f'jobset_response: {jobset_response}'
748
745
  log_utils.tail_logs(
749
- job_id,
746
+ jobset_response,
750
747
  worker_id=node_rank,
751
748
  follow=follow,
752
749
  num_logs=num_lines,
@@ -855,16 +852,23 @@ def launch(
855
852
  'Serving information detected. Use '
856
853
  '`konduktor serve launch` instead for serving.'
857
854
  )
855
+ try:
856
+ _launch_with_confirm(
857
+ task,
858
+ dryrun=dryrun,
859
+ detach_run=detach_run,
860
+ no_confirm=yes,
861
+ serving=bool(task.serving),
862
+ )
863
+ except KeyboardInterrupt:
864
+ click.secho(
865
+ f'Detaching... manage your job {task.name} with the following commands:',
866
+ fg='yellow',
867
+ bold=True,
868
+ )
858
869
 
859
- job_name = _launch_with_confirm(
860
- task,
861
- dryrun=dryrun,
862
- detach_run=detach_run,
863
- no_confirm=yes,
864
- serving=bool(task.serving),
865
- )
866
870
  click.secho(
867
- ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, job_name),
871
+ ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB, task.name),
868
872
  fg='green',
869
873
  bold=True,
870
874
  )
@@ -1139,6 +1143,12 @@ def stop(
1139
1143
  for job in track(jobs, description='Suspending job(s)...'):
1140
1144
  jobset_utils.stop_jobset(namespace, job)
1141
1145
 
1146
+ click.secho(
1147
+ ux_utils.command_hint_messages(ux_utils.CommandHintType.JOB_STOP, jobs),
1148
+ fg='green',
1149
+ bold=True,
1150
+ )
1151
+
1142
1152
 
1143
1153
  @cli.command(cls=_DocumentedCodeCommand)
1144
1154
  @click.argument(
@@ -1836,7 +1846,7 @@ def main():
1836
1846
  return cli(standalone_mode=False)
1837
1847
  except click.exceptions.Abort:
1838
1848
  click.secho('Detaching...', fg='yellow', bold=True)
1839
- return
1849
+ return None
1840
1850
 
1841
1851
 
1842
1852
  if __name__ == '__main__':
@@ -75,12 +75,14 @@ def get_logger(name: str):
75
75
  fh.setFormatter(FORMATTER)
76
76
  logger.addHandler(fh)
77
77
 
78
- # --- Console logging: DEBUG level only if KONDUKTOR_DEBUG=1 ---
78
+ # --- Console logging: INFO level by default, DEBUG if KONDUKTOR_DEBUG=1 ---
79
+ ch = logging.StreamHandler()
79
80
  if os.environ.get('KONDUKTOR_DEBUG') == '1':
80
- ch = logging.StreamHandler()
81
81
  ch.setLevel(logging.DEBUG)
82
- ch.setFormatter(FORMATTER)
83
- logger.addHandler(ch)
82
+ else:
83
+ ch.setLevel(logging.INFO)
84
+ ch.setFormatter(FORMATTER)
85
+ logger.addHandler(ch)
84
86
 
85
87
  logger.propagate = False
86
88
  return logger
@@ -11,6 +11,9 @@ jobset:
11
11
  {{ jobset_accelerator_label }}: "{{ accelerator_type }}"
12
12
  {{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
13
13
  {% endif %}
14
+ {% if max_execution_time %}
15
+ {{ jobset_max_execution_time_label }}: "{{ max_execution_time }}"
16
+ {% endif %}
14
17
  trainy.ai/konduktor-managed: "true"
15
18
  parent: "trainy"
16
19
  annotations: {}
@@ -337,45 +337,51 @@ def tail_loki_logs_ws(
337
337
 
338
338
 
339
339
  def tail_vicky_logs(
340
- job_name: str,
340
+ jobset_response: Dict[str, Any],
341
341
  worker_id: int = 0,
342
- num_logs: int = 1000,
342
+ num_logs: int = -1,
343
343
  follow: bool = True,
344
344
  ):
345
+ job_name = jobset_response['metadata']['name']
345
346
  context = kubernetes_utils.get_current_kube_config_context_name()
346
347
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
347
348
  query: Dict[str, Any] = {}
348
- if num_logs > 5000:
349
- # TODO(asaiacai): we should not have a limit on the number of logs, but rather
350
- # let the user specify any number of lines, and we can print the last N lines.
351
- # this can be done in chunks. Potentially, we can query range
352
- # until we reach the end of the log and then invoke tail again.
353
- # Also include checks that the job is running/ever ran.
354
- raise ValueError('num_logs must be less than or equal to 5000')
355
- logger.info('ignoring num_logs argument for VictoriaLogs')
356
349
  vicky_svc = kr8s.objects.Service.get(
357
350
  'vls-victoria-logs-single-server', namespace='victoria-logs'
358
351
  )
352
+
353
+ if num_logs == -1:
354
+ query = {}
355
+ else:
356
+ assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
357
+ query = {'limit': num_logs}
358
+ if follow:
359
+ logger.info(
360
+ 'No end time found, tailing logs from 1 hour ago. '
361
+ 'If logs come up empty, there might be logs just earlier '
362
+ 'than the past hour, check Grafana or use:\n'
363
+ f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
364
+ f'`konduktor tail --no-follow {job_name}`'
365
+ f'{colorama.Style.RESET_ALL}'
366
+ )
367
+ query['start_offset'] = '1h'
368
+ query['query'] = (
369
+ f'k8s.namespace.name: "{namespace}" AND '
370
+ f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
371
+ f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
372
+ )
373
+
359
374
  with kr8s.portforward.PortForward(
360
375
  vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
361
376
  ) as port:
362
377
  if follow:
363
378
  timeout = INFINITY
364
379
  vicky_url = f'http://localhost:{port}/select/logsql/tail'
365
- query = {}
366
380
  else:
367
381
  vicky_url = f'http://localhost:{port}/select/logsql/query'
368
- query = {'limit': num_logs}
369
382
  timeout = 1
370
383
  logger.debug(f'Vicky URL: {vicky_url}')
371
384
 
372
- query['query'] = (
373
- f'k8s.namespace.name: "{namespace}" AND '
374
- f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
375
- f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
376
- )
377
- query['start_offset'] = '1h'
378
-
379
385
  try:
380
386
  logger.debug(f'Making request to {vicky_url} with query: {query}')
381
387
  with requests.post(
@@ -412,16 +418,17 @@ def tail_vicky_logs(
412
418
 
413
419
 
414
420
  def tail_logs(
415
- job_name: str,
421
+ jobset_response: Dict[str, Any],
416
422
  worker_id: int = 0,
417
423
  num_logs: int = 1000,
418
424
  follow: bool = True,
419
425
  ):
426
+ job_name = jobset_response['metadata']['name']
420
427
  logs_backend = config.get_nested(('logs', 'backend'), None)
421
428
  if logs_backend == LogBackend.VICTORIA:
422
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
429
+ tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
423
430
  elif logs_backend == LogBackend.LOKI:
424
431
  tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
425
432
  else:
426
433
  logger.info('Defaulting to VictoriaLogs')
427
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
434
+ tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
@@ -6,7 +6,7 @@ import os
6
6
  import sys
7
7
  import traceback
8
8
  import typing
9
- from typing import Callable, Optional, Union
9
+ from typing import Callable, List, Optional, Union
10
10
 
11
11
  import colorama
12
12
  import rich.console as rich_console
@@ -196,27 +196,41 @@ def spinner_message(
196
196
 
197
197
  class CommandHintType(enum.Enum):
198
198
  JOB = 'JOB'
199
+ JOB_STOP = 'JOB_STOP'
199
200
 
200
201
 
201
- def command_hint_messages(hint_type: CommandHintType, job_id: str) -> str:
202
+ def command_hint_messages(
203
+ hint_type: CommandHintType,
204
+ job_id: Union[str, List[str]],
205
+ ) -> str:
202
206
  """Gets the command hint messages for the given job id."""
207
+ hint_str = '\n📋 Useful Commands'
203
208
  if hint_type == CommandHintType.JOB:
204
209
  job_hint_str = (
205
210
  f'\nJob ID: {job_id}'
206
- f'\n{INDENT_SYMBOL}To cancel the job:\t\t'
207
- f'{BOLD}konduktor down {job_id} {RESET_BOLD}'
208
211
  f'\n{INDENT_SYMBOL}To stream job logs:\t\t'
209
212
  f'{BOLD}konduktor logs {job_id} {RESET_BOLD}'
210
213
  f'\n{INDENT_SYMBOL}To list all jobs:\t\t'
211
214
  f'{BOLD}konduktor status{RESET_BOLD}'
215
+ f'\n{INDENT_SYMBOL}To suspend the job:\t\t'
216
+ f'{BOLD}konduktor stop {job_id} {RESET_BOLD}'
217
+ f'\n{INDENT_SYMBOL}{colorama.Fore.RED}To delete the job:\t\t'
218
+ f'{BOLD}konduktor down {job_id} {RESET_BOLD}{colorama.Style.RESET_ALL}'
212
219
  )
213
- hint_str = '\n📋 Useful Commands'
214
220
  hint_str += f'{job_hint_str}'
215
- if config.get_nested(('tailscale', 'secret_name'), None) is not None:
216
- hint_str += (
217
- f'\n{INDENT_SYMBOL}To tailscale ssh:\t\t'
218
- f'{BOLD}ssh root@{job_id}-workers-0-0 {RESET_BOLD}'
219
- )
220
- return hint_str
221
+ elif hint_type == CommandHintType.JOB_STOP:
222
+ assert isinstance(job_id, list), 'job_id must be a list of strings'
223
+ job_ids_str = ' '.join(job_id)
224
+ hint_str += (
225
+ f'\n{INDENT_SYMBOL}To resume the following jobs:\t\t'
226
+ f'{BOLD}konduktor start {job_ids_str} {RESET_BOLD}'
227
+ )
221
228
  else:
222
229
  raise ValueError(f'Invalid hint type: {hint_type}')
230
+
231
+ if config.get_nested(('tailscale', 'secret_name'), None) is not None:
232
+ hint_str += (
233
+ f'\n{INDENT_SYMBOL}To tailscale ssh:\t\t'
234
+ f'{BOLD}ssh root@{job_id}-workers-0-0 {RESET_BOLD}'
235
+ )
236
+ return hint_str
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250810104857"
3
+ version = "0.1.0.dev20250812105102"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}