konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff shows the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/backends/jobset_utils.py
@@ -0,0 +1,726 @@
+"""Jobset utils: wraps CRUD operations for jobsets"""
+
+import enum
+import json
+import tempfile
+import time
+import typing
+from datetime import datetime, timedelta, timezone
+from typing import Any, Dict, Optional, Tuple
+
+import click
+import colorama
+
+import konduktor
+from konduktor import kube_client, logging
+from konduktor.backends import constants as backend_constants
+from konduktor.backends import pod_utils
+from konduktor.utils import (
+    common_utils,
+    kubernetes_utils,
+    log_utils,
+)
+
+if typing.TYPE_CHECKING:
+    pass
+
+logger = logging.get_logger(__name__)
+
+JOBSET_API_GROUP = 'jobset.x-k8s.io'
+JOBSET_API_VERSION = 'v1alpha2'
+JOBSET_PLURAL = 'jobsets'
+
+# Use shared constants from konduktor.backends.constants
+JOBSET_NAME_LABEL = backend_constants.JOB_NAME_LABEL
+JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
+JOBSET_USER_LABEL = backend_constants.USER_LABEL
+JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
+JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
+JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL
+
+SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
+
+_JOBSET_METADATA_LABELS = {
+    'jobset_name_label': JOBSET_NAME_LABEL,
+    'jobset_userid_label': JOBSET_USERID_LABEL,
+    'jobset_user_label': JOBSET_USER_LABEL,
+    'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
+    'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
+    'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
+}
+
+
+class JobNotFoundError(Exception):
+    pass
+
+
+class JobStatus(enum.Enum):
+    SUSPENDED = 'SUSPENDED'
+    ACTIVE = 'ACTIVE'
+    COMPLETED = 'COMPLETED'
+    FAILED = 'FAILED'
+    PENDING = 'PENDING'
+
+
+if typing.TYPE_CHECKING:
+    import konduktor
+
+
+def create_jobset(
+    namespace: str,
+    task: 'konduktor.Task',
+    pod_spec: Dict[str, Any],
+    dryrun: bool = False,
+) -> Optional[Dict[str, Any]]:
+    """Creates a jobset based on the task definition and pod spec
+    and returns the created jobset spec
+    """
+    assert task.resources is not None, 'Task resources are undefined'
+    accelerator_type = task.resources.get_accelerator_type() or 'None'
+    num_accelerators = task.resources.get_accelerator_count() or 0
+    labels = task.resources.labels if task.resources.labels else {}
+    with tempfile.NamedTemporaryFile() as temp:
+        common_utils.fill_template(
+            'jobset.yaml.j2',
+            {
+                'job_name': task.name,
+                'user_id': common_utils.user_and_hostname_hash(),
+                'num_nodes': task.num_nodes,
+                'user': common_utils.get_cleaned_username(),
+                'accelerator_type': accelerator_type,
+                'num_accelerators': num_accelerators,
+                'completions': task.resources.get_completions(),
+                'max_restarts': task.resources.get_max_restarts(),
+                'max_execution_time': labels.get('maxRunDurationSeconds', None),
+                **_JOBSET_METADATA_LABELS,
+            },
+            temp.name,
+        )
+        jobset_spec = common_utils.read_yaml(temp.name)
+    # Inject JobSet metadata (labels and annotations)
+    pod_utils.inject_jobset_metadata(jobset_spec, task)
+
+    # Merge pod spec into JobSet template
+    pod_utils.merge_pod_into_jobset_template(jobset_spec, pod_spec)
+    try:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        jobset = kube_client.crd_api(context=context).create_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            body=jobset_spec['jobset'],
+            dry_run='All' if dryrun else None,
+        )
+        logger.info(
+            f'task {colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+            f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{task.name}'
+            f'{colorama.Style.RESET_ALL} created in context '
+            f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}{context}'
+            f'{colorama.Style.RESET_ALL}, namespace '
+            f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}{namespace}'
+            f'{colorama.Style.RESET_ALL}'
+        )
+        return jobset
+    except kube_client.api_exception() as err:
+        try:
+            error_body = json.loads(err.body)
+            error_message = error_body.get('message', '')
+            logger.error(f'error creating jobset: {error_message}')
+        except json.JSONDecodeError:
+            error_message = str(err.body)
+            logger.error(f'error creating jobset: {error_message}')
+        else:
+            # Re-raise the exception if it's a different error
+            raise err
+        return None
+
+
+def list_jobset(namespace: str) -> Optional[Dict[str, Any]]:
+    """Lists all jobsets in this namespace"""
+    try:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        response = kube_client.crd_api(context=context).list_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+        )
+        return response
+    except kube_client.api_exception() as err:
+        try:
+            error_body = json.loads(err.body)
+            error_message = error_body.get('message', '')
+            logger.error(f'error listing jobset: {error_message}')
+        except json.JSONDecodeError:
+            error_message = str(err.body)
+            logger.error(f'error creating jobset: {error_message}')
+        else:
+            # Re-raise the exception if it's a different error
+            raise err
+        return None
+
+
+def get_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Retrieves jobset in this namespace"""
+    try:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        response = kube_client.crd_api(context=context).get_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            name=job_name,
+        )
+        return response
+    except kube_client.api_exception() as err:
+        if err.status == 404:
+            raise JobNotFoundError(
+                f"Jobset '{job_name}' " f"not found in namespace '{namespace}'."
+            )
+        try:
+            error_body = json.loads(err.body)
+            error_message = error_body.get('message', '')
+            logger.error(f'error getting jobset: {error_message}')
+        except json.JSONDecodeError:
+            error_message = str(err.body)
+            logger.error(f'error creating jobset: {error_message}')
+        else:
+            # Re-raise the exception if it's a different error
+            raise err
+        return None
+
+
+def delete_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Deletes jobset in this namespace
+
+    Args:
+        namespace: Namespace where jobset exists
+        job_name: Name of jobset to delete
+
+    Returns:
+        Response from delete operation
+    """
+    try:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        response = kube_client.crd_api(context=context).delete_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            name=job_name,
+        )
+        return response
+    except kube_client.api_exception() as err:
+        try:
+            error_body = json.loads(err.body)
+            error_message = error_body.get('message', '')
+            logger.error(f'error deleting jobset: {error_message}')
+        except json.JSONDecodeError:
+            error_message = str(err.body)
+            logger.error(f'error deleting jobset: {error_message}')
+        else:
+            # Re-raise the exception if it's a different error
+            raise err
+        return None
+
+
+def stop_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Stops jobset in this namespace"""
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        # First check if the job exists
+        get_jobset(namespace, job_name)
+
+        # Apply patch to suspend the jobset and add annotations
+        # Time is in UTC but gets converted to local timezone in the konduktor status UI
+        patch = {
+            'spec': {'suspend': True},
+            'metadata': {
+                'annotations': {
+                    backend_constants.STOP_USERID_LABEL: (
+                        common_utils.user_and_hostname_hash()
+                    ),
+                    backend_constants.STOP_USERNAME_LABEL: (
+                        common_utils.get_cleaned_username()
+                    ),
+                }
+            },
+        }
+        response = kube_client.crd_api(context=context).patch_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            name=job_name,
+            body=patch,
+        )
+
+        # Also suspend the associated Kueue workload to prevent automatic resumption
+        try:
+            # Find the workload for this jobset
+            workloads = kube_client.crd_api(
+                context=context
+            ).list_namespaced_custom_object(
+                group='kueue.x-k8s.io',
+                version='v1beta1',
+                namespace=namespace,
+                plural='workloads',
+            )
+            for workload in workloads.get('items', []):
+                if workload['metadata']['name'].startswith(f'jobset-{job_name}-'):
+                    # Suspend the workload
+                    workload_patch = {'spec': {'active': False}}
+                    kube_client.crd_api(context=context).patch_namespaced_custom_object(
+                        group='kueue.x-k8s.io',
+                        version='v1beta1',
+                        namespace=namespace,
+                        plural='workloads',
+                        name=workload['metadata']['name'],
+                        body=workload_patch,
+                    )
+                    break
+        except Exception:
+            # If workload suspension fails, continue (JobSet suspension still worked)
+            pass
+
+        return response
+    except kube_client.api_exception() as e:
+        if e.status == 404:
+            raise JobNotFoundError(f'Job {job_name} not found in namespace {namespace}')
+        else:
+            raise e
+
+
+def start_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Starts jobset in this namespace"""
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        # First check if the job exists
+        get_jobset(namespace, job_name)
+
+        # Apply patch to resume the jobset and remove suspension annotations
+        patch = {
+            'spec': {'suspend': False},
+            'metadata': {
+                'annotations': {
+                    backend_constants.STOP_USERID_LABEL: None,
+                    backend_constants.STOP_USERNAME_LABEL: None,
+                }
+            },
+        }
+        response = kube_client.crd_api(context=context).patch_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            name=job_name,
+            body=patch,
+        )
+
+        # Also reactivate the associated Kueue workload
+        try:
+            # Find the workload for this jobset
+            workloads = kube_client.crd_api(
+                context=context
+            ).list_namespaced_custom_object(
+                group='kueue.x-k8s.io',
+                version='v1beta1',
+                namespace=namespace,
+                plural='workloads',
+            )
+            for workload in workloads.get('items', []):
+                if workload['metadata']['name'].startswith(f'jobset-{job_name}-'):
+                    # Reactivate the workload
+                    workload_patch = {'spec': {'active': True}}
+                    kube_client.crd_api(context=context).patch_namespaced_custom_object(
+                        group='kueue.x-k8s.io',
+                        version='v1beta1',
+                        namespace=namespace,
+                        plural='workloads',
+                        name=workload['metadata']['name'],
+                        body=workload_patch,
+                    )
+                    break
+        except Exception:
+            # If workload reactivation fails, continue (JobSet resumption still worked)
+            pass
+
+        return response
+    except kube_client.api_exception() as e:
+        if e.status == 404:
+            raise JobNotFoundError(f'Job {job_name} not found in namespace {namespace}')
+        else:
+            raise e
+
+
+def get_job(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Gets a specific job from a jobset by name and worker index
+
+    Args:
+        namespace: Namespace where job exists
+        job_name: Name of jobset containing the job
+        worker_id: Index of the worker job to get (defaults to 0)
+
+    Returns:
+        Job object if found
+    """
+    try:
+        # Get the job object using the job name
+        # pattern {jobset-name}-workers-0-{worker_id}
+        job_name = f'{job_name}-workers-0'
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        response = kube_client.batch_api(context=context).read_namespaced_job(
+            name=job_name, namespace=namespace
+        )
+        return response
+    except kube_client.api_exception() as err:
+        try:
+            error_body = json.loads(err.body)
+            error_message = error_body.get('message', '')
+            logger.error(f'error getting job: {error_message}')
+        except json.JSONDecodeError:
+            error_message = str(err.body)
+            logger.error(f'error getting job: {error_message}')
+        else:
+            # Re-raise the exception if it's a different error
+            raise err
+        return None
+
+
+def _parse_timestamp_filter(timestamp_str: str) -> datetime:
+    """Parse timestamp string into datetime object for filtering
+
+    Supported formats:
+    - "08/06/25 03:54PM" (full datetime)
+    - "08/06/25" (date only)
+    - "03:54PM" (time only, uses today's date)
+    """
+
+    # Try different formats
+    formats = [
+        '%m/%d/%y %I:%M%p',  # 08/06/25 03:54PM (full datetime)
+        '%m/%d/%y',  # 08/06/25 (date only)
+        '%I:%M%p',  # 03:54PM (time only)
+    ]
+
+    for fmt in formats:
+        try:
+            dt = datetime.strptime(timestamp_str, fmt)
+
+            # Handle time-only format (add today's date)
+            if fmt == '%I:%M%p':
+                today = datetime.now().strftime('%m/%d/%y')
+                dt = datetime.strptime(f'{today} {timestamp_str}', '%m/%d/%y %I:%M%p')
+
+            # If no timezone info, assume local timezone and convert to UTC
+            if dt.tzinfo is None:
+                if fmt in ['%m/%d/%y %I:%M%p', '%I:%M%p']:
+                    # For display format, convert from local time to UTC
+                    # Get current local timezone offset
+                    local_offset = time.timezone if not time.daylight else time.altzone
+                    # Convert local time to UTC by adding the offset
+                    # (since timezone is negative)
+                    dt = dt.replace(tzinfo=timezone.utc) + timedelta(
+                        seconds=abs(local_offset)
+                    )
+                else:
+                    # Handle date-only format (local midnight --> UTC)
+                    local_tz = datetime.now().astimezone().tzinfo
+                    return dt.replace(tzinfo=local_tz).astimezone(timezone.utc)
+            return dt
+        except ValueError:
+            continue
+
+    raise ValueError(
+        f"Unable to parse timestamp '{timestamp_str}'. "
+        f"Supported formats: '08/06/25 03:54PM', '08/06/25', '03:54PM'"
+    )
+
+
+def _format_timestamp(timestamp: str) -> str:
+    """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
+    # Parse UTC timestamp and convert to local time
+    dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
+        tzinfo=timezone.utc
+    )
+    dt_local = dt_utc.astimezone()  # Convert to local timezone
+    return dt_local.strftime('%m/%d/%y %I:%M%p')
+
+
+def _get_job_start_time(job: Dict[str, Any]) -> str:
+    status = job.get('status', {})
+    for condition in status.get('conditions', []):
+        if condition['reason'] == 'ResumeJobs':
+            return condition.get('lastTransitionTime', '')
+    return '-'
+
+
+def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
+    """Extract end time from JobSet conditions (Completed or Failed)"""
+    conditions = job.get('status', {}).get('conditions', [])
+    for condition in conditions:
+        # Look for terminal conditions with status=True
+        if (
+            condition.get('type') in ['Completed', 'Failed']
+            and condition.get('status') == 'True'
+        ):
+            return condition.get('lastTransitionTime', '')
+    return '-'
+
+
+def _get_time_delta(delta: 'timedelta') -> Tuple[str, 'timedelta']:
+    total_seconds = int(delta.total_seconds())
+
+    days, remainder = divmod(total_seconds, 86400)  # 86400 seconds in a day
+    hours, remainder = divmod(remainder, 3600)  # 3600 seconds in an hour
+    minutes, seconds = divmod(remainder, 60)  # 60 seconds in a minute
+
+    days_str = f'{days} day{"s" if days != 1 else ""}, ' if days > 0 else ''
+    hours_str = f'{hours} hr{"s" if hours != 1 else ""}, ' if hours > 0 else ''
+    minutes_str = (
+        f'{minutes} min{"s" if minutes != 1 else ""}'
+        if minutes > 0 and days == 0
+        else ''
+    )
+
+    seconds_str = (
+        f'{seconds} sec{"s" if seconds != 1 else ""}'
+        if seconds > 0 and days == 0 and hours == 0 and minutes == 0
+        else ''
+    )
+
+    result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
+    return result if result else '<1 minute', delta
+
+
+def _get_job_length(start_time: str, end_time: str) -> str:
+    if start_time == '-' or end_time == '-':
+        return '-'
+    else:
+        start = datetime.strptime(start_time, '%m/%d/%y %I:%M%p')
+        end = datetime.strptime(end_time, '%m/%d/%y %I:%M%p')
+        delta, _ = _get_time_delta(end - start)
+        return delta
+
+
+def show_status_table(
+    namespace: str,
+    all_users: bool,
+    limit: Optional[int] = None,
+    after: Optional[str] = None,
+    before: Optional[str] = None,
+):
+    """Compute cluster table values and display with optional filtering and pagination.
+
+    Args:
+        namespace: Kubernetes namespace to search
+        all_users: Whether to show jobs from all users
+        limit: Maximum number of jobs to display
+        after: Show jobs created after this timestamp
+        before: Show jobs created before this timestamp
+    """
+    # TODO(zhwu): Update the information for autostop clusters.
+
+    def _get_status_string_colorized(
+        status: Dict[str, Any], job: Dict[str, Any]
+    ) -> str:
+        # Handle case where status might be empty or missing
+        if not status:
+            return (
+                f'{colorama.Fore.YELLOW}'
+                f'{JobStatus.PENDING.name}{colorama.Style.RESET_ALL}'
+            )
+
+        terminalState = status.get('terminalState', None)
+        if terminalState and terminalState.upper() == JobStatus.COMPLETED.name.upper():
+            return (
+                f'{colorama.Fore.GREEN}'
+                f'{JobStatus.COMPLETED.name}{colorama.Style.RESET_ALL}'
+            )
+        elif terminalState and terminalState.upper() == JobStatus.FAILED.name.upper():
+            return (
+                f'{colorama.Fore.RED}'
+                f'{JobStatus.FAILED.name}{colorama.Style.RESET_ALL}'
+            )
+        elif status.get('replicatedJobsStatus', [{}])[0].get('ready', False):
+            return (
+                f'{colorama.Fore.CYAN}'
+                f'{JobStatus.ACTIVE.name}{colorama.Style.RESET_ALL}'
+            )
+        elif status.get('replicatedJobsStatus', [{}])[0].get('suspended', False):
+            # Check if this was manually suspended
+            annotations = job.get('metadata', {}).get('annotations', {})
+            if annotations.get(backend_constants.STOP_USERID_LABEL):
+                username = annotations.get(
+                    backend_constants.STOP_USERNAME_LABEL, 'unknown'
+                )
+                return (
+                    f'{colorama.Fore.BLUE}'
+                    f'{JobStatus.SUSPENDED.name} '
+                    f'(by {username}){colorama.Style.RESET_ALL}'
+                )
+            else:
+                return (
+                    f'{colorama.Fore.BLUE}'
+                    f'{JobStatus.SUSPENDED.name} (by system){colorama.Style.RESET_ALL}'
+                )
+        else:
+            return (
+                f'{colorama.Fore.YELLOW}'
+                f'{JobStatus.PENDING.name}{colorama.Style.RESET_ALL}'
+            )
+
+    def _get_resources(job: Dict[str, Any]) -> str:
+        num_pods = int(
+            job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
+        )  # noqa: E501
+        resources = job['spec']['replicatedJobs'][0]['template']['spec']['template'][
+            'spec'
+        ]['containers'][0]['resources']['limits']  # noqa: E501
+        cpu, memory = resources['cpu'], resources['memory']
+        accelerator = job['metadata']['labels'].get(JOBSET_ACCELERATOR_LABEL, None)
+        num_accelerators = job['metadata']['labels'].get(
+            JOBSET_NUM_ACCELERATORS_LABEL, None
+        )
+        if accelerator and accelerator != 'None':
+            if num_accelerators and num_accelerators != '0':
+                accelerator_with_count = f'{accelerator}:{num_accelerators}'
+            else:
+                accelerator_with_count = accelerator
+            return f'{num_pods}x({cpu}CPU, {memory}MEM, {accelerator_with_count})'
+        else:
+            return f'{num_pods}x({cpu}CPU, {memory}MEM)'
+
+    if all_users:
+        columns = [
+            'NAME',
+            'USER',
+            'STATUS',
+            'RESOURCES',
+            'SUBMITTED',
+            'START TIME',
+            'END TIME',
+            'DURATION',
+        ]
+    else:
+        columns = [
+            'NAME',
+            'STATUS',
+            'RESOURCES',
+            'SUBMITTED',
+            'START TIME',
+            'END TIME',
+            'DURATION',
+        ]
+    job_table = log_utils.create_table(columns)
+    job_specs = list_jobset(namespace)
+    assert job_specs is not None, 'Retrieving jobs failed'
+
+    # Parse timestamp filters if provided
+    after_dt = None
+    before_dt = None
+    if after:
+        try:
+            after_dt = _parse_timestamp_filter(after)
+        except ValueError as e:
+            click.secho(f'Error parsing --after timestamp: {e}', fg='red', err=True)
+            return
+    if before:
+        try:
+            before_dt = _parse_timestamp_filter(before)
+        except ValueError as e:
+            click.secho(f'Error parsing --before timestamp: {e}', fg='red', err=True)
+            return
+
+    rows = []
+    for job in job_specs['items']:
+        # Apply timestamp filtering
+        if after_dt or before_dt:
+            job_creation_time = datetime.strptime(
+                job['metadata']['creationTimestamp'], '%Y-%m-%dT%H:%M:%SZ'
+            ).replace(tzinfo=timezone.utc)
+
+            if after_dt and job_creation_time <= after_dt:
+                continue
+            if before_dt and job_creation_time >= before_dt:
+                continue
+        # Get start time
+        start_time = _get_job_start_time(job)
+        if start_time != '-':
+            start_time = _format_timestamp(start_time)
+
+        # Get submitted time (how long ago)
+        time_delta = datetime.now(timezone.utc) - datetime.strptime(
+            job['metadata']['creationTimestamp'], '%Y-%m-%dT%H:%M:%SZ'
+        ).replace(tzinfo=timezone.utc)
+        submitted_time, _ = _get_time_delta(time_delta)
+
+        # Get end time (from JobSet conditions)
+        end_time = _get_end_time_from_conditions(job)
+        if end_time != '-':
+            end_time = _format_timestamp(end_time)
+
+        job_length = _get_job_length(start_time, end_time)
+
+        if all_users:
+            rows.append(
+                [
+                    job['metadata']['name'],
+                    job['metadata']['labels'][JOBSET_USERID_LABEL],
+                    _get_status_string_colorized(job.get('status', {}), job),
+                    _get_resources(job),
+                    submitted_time,
+                    start_time,
+                    end_time,
+                    job_length,
+                    job['metadata']['creationTimestamp'],
+                ]
+            )
+        elif (
+            not all_users
+            and job['metadata']['labels'][JOBSET_USER_LABEL]
+            == common_utils.get_cleaned_username()
+        ):
+            rows.append(
+                [
+                    job['metadata']['name'],
+                    _get_status_string_colorized(job.get('status', {}), job),
+                    _get_resources(job),
+                    submitted_time,
+                    start_time,
+                    end_time,
+                    job_length,
+                    job['metadata']['creationTimestamp'],
+                ]
+            )
+
+    # Sort by creation timestamp (most recent first)
+    rows = sorted(rows, key=lambda x: x[-1], reverse=True)
+
+    # Apply limit if specified
+    if limit and limit > 0:
+        rows = rows[:limit]
+
+    # Show pagination info if applicable
+    total_jobs = len(job_specs['items'])
+    filtered_jobs = len(rows)
+
+    if limit or after or before:
+        filter_info = []
+        if after:
+            filter_info.append(f'after {after}')
+        if before:
+            filter_info.append(f'before {before}')
+        if limit:
+            filter_info.append(f'limit {limit}')
+
+        filter_str = ', '.join(filter_info)
+        click.secho(f'Showing {filtered_jobs} jobs ({filter_str})', fg='yellow')
+        if total_jobs != filtered_jobs:
+            click.secho(f'Total jobs in namespace: {total_jobs}', fg='yellow')
+
+    # Remove the sorting timestamp and add rows to table
+    for row in rows:
+        job_table.add_row(row[:-1])
+    print(job_table)
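
For orientation only: a minimal usage sketch (not part of the package) of how the jobset helpers above might be called from Python. The module path, function names, and signatures are taken from the diff; the namespace and job name are illustrative, and a working kubeconfig plus the JobSet and Kueue CRDs are assumed.

from konduktor.backends import jobset_utils

namespace = 'default'  # illustrative namespace

# List JobSets and print their names (list_jobset returns None on API errors).
jobs = jobset_utils.list_jobset(namespace)
if jobs is not None:
    for job in jobs.get('items', []):
        print(job['metadata']['name'])

# Render the status table for the current user, limited to the 10 most recent jobs.
jobset_utils.show_status_table(namespace, all_users=False, limit=10)

# Suspend and later resume a job by name; both raise JobNotFoundError if it is missing.
jobset_utils.stop_jobset(namespace, 'my-training-run')  # illustrative job name
jobset_utils.start_jobset(namespace, 'my-training-run')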