konduktor-nightly 0.1.0.dev20250811105223__tar.gz → 0.1.0.dev20250813105033__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. See the release details on the registry page for more information.

Files changed (103)
  1. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/constants.py +1 -0
  4. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/jobset.py +3 -2
  5. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/jobset_utils.py +39 -22
  6. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/cli.py +3 -6
  7. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/templates/jobset.yaml.j2 +3 -0
  8. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/log_utils.py +29 -22
  9. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/pyproject.toml +1 -1
  10. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/LICENSE +0 -0
  11. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/README.md +0 -0
  12. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/adaptors/__init__.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/adaptors/aws.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/adaptors/common.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/adaptors/gcp.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/authentication.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/__init__.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/backend.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/deployment.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/deployment_utils.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/backends/pod_utils.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/check.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/config.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/constants.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/controller/__init__.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/controller/constants.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/controller/launch.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/controller/node.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/controller/parse.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/README.md +0 -0
  31. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/backend/main.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/backend/sockets.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  34. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/.gitignore +0 -0
  35. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  36. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  37. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  38. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  39. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  45. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  50. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/globals.css +0 -0
  51. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  52. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/layout.js +0 -0
  53. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  54. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/app/page.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  56. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  57. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/package-lock.json +0 -0
  58. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/package.json +0 -0
  59. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  60. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/server.js +0 -0
  61. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  62. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/__init__.py +0 -0
  63. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/aws/__init__.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/aws/s3.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/constants.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/data_utils.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/gcp/__init__.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/gcp/constants.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/gcp/gcs.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/gcp/utils.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/registry.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/storage.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/data/storage_utils.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/execution.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/kube_client.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/logging.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/manifests/controller_deployment.yaml +0 -0
  78. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  79. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  80. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/resource.py +0 -0
  82. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/serving.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/task.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/templates/deployment.yaml.j2 +0 -0
  85. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/templates/pod.yaml.j2 +0 -0
  86. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/usage/__init__.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/usage/constants.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/__init__.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/accelerator_registry.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/annotations.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/base64_utils.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/common_utils.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/constants.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/env_options.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/exceptions.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/kubernetes_enums.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/kubernetes_utils.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/loki_utils.py +0 -0
  99. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/rich_utils.py +0 -0
  100. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/schemas.py +0 -0
  101. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/subprocess_utils.py +0 -0
  102. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/ux_utils.py +0 -0
  103. {konduktor_nightly-0.1.0.dev20250811105223 → konduktor_nightly-0.1.0.dev20250813105033}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250811105223
3
+ Version: 0.1.0.dev20250813105033
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '92fe69bd3f29e7b191de663c598dfcf10738f87a'
14
+ _KONDUKTOR_COMMIT_SHA = 'f4ba2084fac1c1030245b475323f4f3a57fd3fa3'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250811105223'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250813105033'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -8,6 +8,7 @@ USERID_LABEL = 'trainy.ai/user-id'
8
8
  USER_LABEL = 'trainy.ai/username'
9
9
  ACCELERATOR_LABEL = 'trainy.ai/accelerator'
10
10
  NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
11
+ MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'
11
12
 
12
13
  # Start/stop/status related labels
13
14
  STOP_USERID_LABEL = 'trainy.ai/stop-userid'
@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
176
176
  context = kubernetes_utils.get_current_kube_config_context_name()
177
177
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
178
178
  # TODO(asaiacai): need to set env variables in pod
179
- jobset_utils.create_jobset(
179
+ jobset_response = jobset_utils.create_jobset(
180
180
  namespace,
181
181
  task,
182
182
  pod_spec['kubernetes']['pod_config'],
@@ -192,9 +192,10 @@ class JobsetBackend(backend.Backend):
192
192
  ):
193
193
  _wait_for_jobset_start(namespace, task.name)
194
194
  try:
195
+ assert jobset_response is not None
195
196
  log_thread = threading.Thread(
196
197
  target=log_utils.tail_logs,
197
- args=(task.name,),
198
+ args=(jobset_response,),
198
199
  daemon=True,
199
200
  )
200
201
  logger.info('streaming logs...')
@@ -39,6 +39,7 @@ JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
39
39
  JOBSET_USER_LABEL = backend_constants.USER_LABEL
40
40
  JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
41
41
  JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
42
+ JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL
42
43
 
43
44
  SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
44
45
 
@@ -48,6 +49,7 @@ _JOBSET_METADATA_LABELS = {
48
49
  'jobset_user_label': JOBSET_USER_LABEL,
49
50
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
50
51
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
52
+ 'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
51
53
  }
52
54
 
53
55
 
@@ -79,6 +81,7 @@ def create_jobset(
79
81
  assert task.resources is not None, 'Task resources are undefined'
80
82
  accelerator_type = task.resources.get_accelerator_type() or 'None'
81
83
  num_accelerators = task.resources.get_accelerator_count() or 0
84
+ labels = task.resources.labels if task.resources.labels else {}
82
85
  with tempfile.NamedTemporaryFile() as temp:
83
86
  common_utils.fill_template(
84
87
  'jobset.yaml.j2',
@@ -91,6 +94,7 @@ def create_jobset(
91
94
  'num_accelerators': num_accelerators,
92
95
  'completions': task.resources.get_completions(),
93
96
  'max_restarts': task.resources.get_max_restarts(),
97
+ 'max_execution_time': labels.get('maxRunDurationSeconds', None),
94
98
  **_JOBSET_METADATA_LABELS,
95
99
  },
96
100
  temp.name,
@@ -430,6 +434,36 @@ def _parse_timestamp_filter(timestamp_str: str) -> datetime:
430
434
  )
431
435
 
432
436
 
437
+ def _format_timestamp(timestamp: str) -> str:
438
+ """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
439
+ # Parse UTC timestamp and convert to local time
440
+ dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
441
+ tzinfo=timezone.utc
442
+ )
443
+ dt_local = dt_utc.astimezone() # Convert to local timezone
444
+ return dt_local.strftime('%m/%d/%y %I:%M%p')
445
+
446
+
447
+ def _get_job_start_time(job: Dict[str, Any]) -> str:
448
+ for condition in job['status']['conditions']:
449
+ if condition['reason'] == 'ResumeJobs':
450
+ return condition.get('lastTransitionTime', '')
451
+ return '-'
452
+
453
+
454
+ def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
455
+ """Extract end time from JobSet conditions (Completed or Failed)"""
456
+ conditions = job.get('status', {}).get('conditions', [])
457
+ for condition in conditions:
458
+ # Look for terminal conditions with status=True
459
+ if (
460
+ condition.get('type') in ['Completed', 'Failed']
461
+ and condition.get('status') == 'True'
462
+ ):
463
+ return condition.get('lastTransitionTime', '')
464
+ return '-'
465
+
466
+
433
467
  def show_status_table(
434
468
  namespace: str,
435
469
  all_users: bool,
@@ -523,15 +557,6 @@ def show_status_table(
523
557
  result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
524
558
  return result if result else '<1 minute', delta
525
559
 
526
- def _format_timestamp(timestamp: str) -> str:
527
- """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
528
- # Parse UTC timestamp and convert to local time
529
- dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
530
- tzinfo=timezone.utc
531
- )
532
- dt_local = dt_utc.astimezone() # Convert to local timezone
533
- return dt_local.strftime('%m/%d/%y %I:%M%p')
534
-
535
560
  def _get_resources(job: Dict[str, Any]) -> str:
536
561
  num_pods = int(
537
562
  job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
@@ -591,25 +616,17 @@ def show_status_table(
591
616
  if before_dt and job_creation_time >= before_dt:
592
617
  continue
593
618
  # Get start time
594
- start_time = _format_timestamp(job['metadata']['creationTimestamp'])
619
+ start_time = _get_job_start_time(job)
620
+ if start_time != '-':
621
+ start_time = _format_timestamp(start_time)
595
622
 
596
623
  # Get submitted time (how long ago)
597
624
  submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
598
625
 
599
626
  # Get end time (from JobSet conditions)
600
- def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
601
- """Extract end time from JobSet conditions (Completed or Failed)"""
602
- conditions = job.get('status', {}).get('conditions', [])
603
- for condition in conditions:
604
- # Look for terminal conditions with status=True
605
- if (
606
- condition.get('type') in ['Completed', 'Failed']
607
- and condition.get('status') == 'True'
608
- ):
609
- return _format_timestamp(condition.get('lastTransitionTime', ''))
610
- return '-'
611
-
612
627
  end_time = _get_end_time_from_conditions(job)
628
+ if end_time != '-':
629
+ end_time = _format_timestamp(end_time)
613
630
 
614
631
  if all_users:
615
632
  rows.append(
@@ -732,7 +732,7 @@ def logs(
732
732
  # Verify the job exists before attempting to tail logs
733
733
  # TODO(asaiacai): unify the 404 logic under jobset_utils
734
734
  try:
735
- jobset_utils.get_jobset(namespace, job_id)
735
+ jobset_response = jobset_utils.get_jobset(namespace, job_id)
736
736
  except jobset_utils.JobNotFoundError:
737
737
  raise click.UsageError(
738
738
  f"Job '{job_id}' not found in namespace "
@@ -741,12 +741,9 @@ def logs(
741
741
  f'{colorama.Style.RESET_ALL}.'
742
742
  )
743
743
 
744
- click.secho(
745
- 'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
746
- fg='yellow',
747
- )
744
+ assert isinstance(jobset_response, dict), f'jobset_response: {jobset_response}'
748
745
  log_utils.tail_logs(
749
- job_id,
746
+ jobset_response,
750
747
  worker_id=node_rank,
751
748
  follow=follow,
752
749
  num_logs=num_lines,
@@ -11,6 +11,9 @@ jobset:
11
11
  {{ jobset_accelerator_label }}: "{{ accelerator_type }}"
12
12
  {{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
13
13
  {% endif %}
14
+ {% if max_execution_time %}
15
+ {{ jobset_max_execution_time_label }}: "{{ max_execution_time }}"
16
+ {% endif %}
14
17
  trainy.ai/konduktor-managed: "true"
15
18
  parent: "trainy"
16
19
  annotations: {}
@@ -337,45 +337,51 @@ def tail_loki_logs_ws(
337
337
 
338
338
 
339
339
  def tail_vicky_logs(
340
- job_name: str,
340
+ jobset_response: Dict[str, Any],
341
341
  worker_id: int = 0,
342
- num_logs: int = 1000,
342
+ num_logs: int = -1,
343
343
  follow: bool = True,
344
344
  ):
345
+ job_name = jobset_response['metadata']['name']
345
346
  context = kubernetes_utils.get_current_kube_config_context_name()
346
347
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
347
348
  query: Dict[str, Any] = {}
348
- if num_logs > 5000:
349
- # TODO(asaiacai): we should not have a limit on the number of logs, but rather
350
- # let the user specify any number of lines, and we can print the last N lines.
351
- # this can be done in chunks. Potentially, we can query range
352
- # until we reach the end of the log and then invoke tail again.
353
- # Also include checks that the job is running/ever ran.
354
- raise ValueError('num_logs must be less than or equal to 5000')
355
- logger.info('ignoring num_logs argument for VictoriaLogs')
356
349
  vicky_svc = kr8s.objects.Service.get(
357
350
  'vls-victoria-logs-single-server', namespace='victoria-logs'
358
351
  )
352
+
353
+ if num_logs == -1:
354
+ query = {}
355
+ else:
356
+ assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
357
+ query = {'limit': num_logs}
358
+ if follow:
359
+ logger.info(
360
+ 'No end time found, tailing logs from 1 hour ago. '
361
+ 'If logs come up empty, there might be logs just earlier '
362
+ 'than the past hour, check Grafana or use:\n'
363
+ f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
364
+ f'`konduktor tail --no-follow {job_name}`'
365
+ f'{colorama.Style.RESET_ALL}'
366
+ )
367
+ query['start_offset'] = '1h'
368
+ query['query'] = (
369
+ f'k8s.namespace.name: "{namespace}" AND '
370
+ f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
371
+ f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
372
+ )
373
+
359
374
  with kr8s.portforward.PortForward(
360
375
  vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
361
376
  ) as port:
362
377
  if follow:
363
378
  timeout = INFINITY
364
379
  vicky_url = f'http://localhost:{port}/select/logsql/tail'
365
- query = {}
366
380
  else:
367
381
  vicky_url = f'http://localhost:{port}/select/logsql/query'
368
- query = {'limit': num_logs}
369
382
  timeout = 1
370
383
  logger.debug(f'Vicky URL: {vicky_url}')
371
384
 
372
- query['query'] = (
373
- f'k8s.namespace.name: "{namespace}" AND '
374
- f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
375
- f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
376
- )
377
- query['start_offset'] = '1h'
378
-
379
385
  try:
380
386
  logger.debug(f'Making request to {vicky_url} with query: {query}')
381
387
  with requests.post(
@@ -412,16 +418,17 @@ def tail_vicky_logs(
412
418
 
413
419
 
414
420
  def tail_logs(
415
- job_name: str,
421
+ jobset_response: Dict[str, Any],
416
422
  worker_id: int = 0,
417
423
  num_logs: int = 1000,
418
424
  follow: bool = True,
419
425
  ):
426
+ job_name = jobset_response['metadata']['name']
420
427
  logs_backend = config.get_nested(('logs', 'backend'), None)
421
428
  if logs_backend == LogBackend.VICTORIA:
422
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
429
+ tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
423
430
  elif logs_backend == LogBackend.LOKI:
424
431
  tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
425
432
  else:
426
433
  logger.info('Defaulting to VictoriaLogs')
427
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
434
+ tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250811105223"
3
+ version = "0.1.0.dev20250813105033"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}