konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,726 @@
|
|
|
1
|
+
"""Jobset utils: wraps CRUD operations for jobsets"""
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
import json
|
|
5
|
+
import tempfile
|
|
6
|
+
import time
|
|
7
|
+
import typing
|
|
8
|
+
from datetime import datetime, timedelta, timezone
|
|
9
|
+
from typing import Any, Dict, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
import colorama
|
|
13
|
+
|
|
14
|
+
import konduktor
|
|
15
|
+
from konduktor import kube_client, logging
|
|
16
|
+
from konduktor.backends import constants as backend_constants
|
|
17
|
+
from konduktor.backends import pod_utils
|
|
18
|
+
from konduktor.utils import (
|
|
19
|
+
common_utils,
|
|
20
|
+
kubernetes_utils,
|
|
21
|
+
log_utils,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
if typing.TYPE_CHECKING:
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
logger = logging.get_logger(__name__)
|
|
28
|
+
|
|
29
|
+
# JobSet CRD coordinates used for every custom-object API call below.
JOBSET_API_GROUP = 'jobset.x-k8s.io'
JOBSET_API_VERSION = 'v1alpha2'
JOBSET_PLURAL = 'jobsets'

# Use shared constants from konduktor.backends.constants
JOBSET_NAME_LABEL = backend_constants.JOB_NAME_LABEL
JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
JOBSET_USER_LABEL = backend_constants.USER_LABEL
JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL

SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL

# Template variables consumed by jobset.yaml.j2 (passed through
# common_utils.fill_template in create_jobset below).
_JOBSET_METADATA_LABELS = {
    'jobset_name_label': JOBSET_NAME_LABEL,
    'jobset_userid_label': JOBSET_USERID_LABEL,
    'jobset_user_label': JOBSET_USER_LABEL,
    'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
    'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
    'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class JobNotFoundError(Exception):
    """Raised when a jobset does not exist in the queried namespace."""

    pass
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class JobStatus(enum.Enum):
    """Display status of a jobset as rendered by the status table below."""

    SUSPENDED = 'SUSPENDED'  # suspended (manually by a user or by the system)
    ACTIVE = 'ACTIVE'  # replicated job reports ready
    COMPLETED = 'COMPLETED'  # terminal: finished successfully
    FAILED = 'FAILED'  # terminal: failed
    PENDING = 'PENDING'  # fallback state while waiting to be scheduled
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if typing.TYPE_CHECKING:
|
|
66
|
+
import konduktor
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def create_jobset(
    namespace: str,
    task: 'konduktor.Task',
    pod_spec: Dict[str, Any],
    dryrun: bool = False,
) -> Optional[Dict[str, Any]]:
    """Creates a jobset based on the task definition and pod spec
    and returns the created jobset spec

    Renders jobset.yaml.j2 with task metadata, injects JobSet
    labels/annotations, merges the pod spec into the JobSet template, then
    creates the custom object via the JobSet CRD API.

    Args:
        namespace: Namespace to create the jobset in.
        task: Task supplying name, node count, resources and labels.
        pod_spec: Pod spec merged into the JobSet's job template.
        dryrun: When True, passes ``dry_run='All'`` so nothing is persisted.

    Returns:
        The created jobset object, or None when an API error occurred whose
        body could not be parsed as JSON (the raw body is logged instead).

    Raises:
        The original API exception when its body is valid JSON (the
        server-provided message is logged first).
    """
    assert task.resources is not None, 'Task resources are undefined'
    accelerator_type = task.resources.get_accelerator_type() or 'None'
    num_accelerators = task.resources.get_accelerator_count() or 0
    labels = task.resources.labels if task.resources.labels else {}
    with tempfile.NamedTemporaryFile() as temp:
        common_utils.fill_template(
            'jobset.yaml.j2',
            {
                'job_name': task.name,
                'user_id': common_utils.user_and_hostname_hash(),
                'num_nodes': task.num_nodes,
                'user': common_utils.get_cleaned_username(),
                'accelerator_type': accelerator_type,
                'num_accelerators': num_accelerators,
                'completions': task.resources.get_completions(),
                'max_restarts': task.resources.get_max_restarts(),
                # maxRunDurationSeconds is read from task labels, if present.
                'max_execution_time': labels.get('maxRunDurationSeconds', None),
                **_JOBSET_METADATA_LABELS,
            },
            temp.name,
        )
        jobset_spec = common_utils.read_yaml(temp.name)
        # Inject JobSet metadata (labels and annotations)
        pod_utils.inject_jobset_metadata(jobset_spec, task)

        # Merge pod spec into JobSet template
        pod_utils.merge_pod_into_jobset_template(jobset_spec, pod_spec)
    try:
        context = kubernetes_utils.get_current_kube_config_context_name()
        jobset = kube_client.crd_api(context=context).create_namespaced_custom_object(
            group=JOBSET_API_GROUP,
            version=JOBSET_API_VERSION,
            namespace=namespace,
            plural=JOBSET_PLURAL,
            body=jobset_spec['jobset'],
            dry_run='All' if dryrun else None,
        )
        # NOTE(review): the CYAN/BRIGHT style codes are emitted twice before
        # the task name; harmless duplication in the rendered output.
        logger.info(
            f'task {colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
            f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{task.name}'
            f'{colorama.Style.RESET_ALL} created in context '
            f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}{context}'
            f'{colorama.Style.RESET_ALL}, namespace '
            f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}{namespace}'
            f'{colorama.Style.RESET_ALL}'
        )
        return jobset
    except kube_client.api_exception() as err:
        try:
            error_body = json.loads(err.body)
            error_message = error_body.get('message', '')
            logger.error(f'error creating jobset: {error_message}')
        except json.JSONDecodeError:
            error_message = str(err.body)
            logger.error(f'error creating jobset: {error_message}')
        else:
            # Re-raise the exception if it's a different error
            # NOTE(review): the 'else' runs when the body parsed as JSON, so
            # structured API errors are logged AND re-raised; only
            # unparseable bodies fall through to return None.
            raise err
        return None
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def list_jobset(namespace: str) -> Optional[Dict[str, Any]]:
    """Lists all jobsets in this namespace

    Args:
        namespace: Namespace to list jobsets from.

    Returns:
        The raw JobSet list response, or None when an API error occurred
        whose body could not be parsed as JSON (the raw body is logged).

    Raises:
        The original API exception when its body is valid JSON (the
        server-provided message is logged first).
    """
    try:
        context = kubernetes_utils.get_current_kube_config_context_name()
        response = kube_client.crd_api(context=context).list_namespaced_custom_object(
            group=JOBSET_API_GROUP,
            version=JOBSET_API_VERSION,
            namespace=namespace,
            plural=JOBSET_PLURAL,
        )
        return response
    except kube_client.api_exception() as err:
        try:
            error_body = json.loads(err.body)
            error_message = error_body.get('message', '')
            logger.error(f'error listing jobset: {error_message}')
        except json.JSONDecodeError:
            error_message = str(err.body)
            # Fixed copy-paste: this is the list path, not the create path.
            logger.error(f'error listing jobset: {error_message}')
        else:
            # Re-raise the exception if it's a different error
            raise err
        return None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def get_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
    """Retrieves jobset in this namespace

    Args:
        namespace: Namespace to look in.
        job_name: Name of the jobset to fetch.

    Returns:
        The jobset object, or None when an API error occurred whose body
        could not be parsed as JSON (the raw body is logged instead).

    Raises:
        JobNotFoundError: when the jobset does not exist (HTTP 404).
        The original API exception when its body is valid JSON.
    """
    try:
        context = kubernetes_utils.get_current_kube_config_context_name()
        response = kube_client.crd_api(context=context).get_namespaced_custom_object(
            group=JOBSET_API_GROUP,
            version=JOBSET_API_VERSION,
            namespace=namespace,
            plural=JOBSET_PLURAL,
            name=job_name,
        )
        return response
    except kube_client.api_exception() as err:
        if err.status == 404:
            raise JobNotFoundError(
                f"Jobset '{job_name}' " f"not found in namespace '{namespace}'."
            )
        try:
            error_body = json.loads(err.body)
            error_message = error_body.get('message', '')
            logger.error(f'error getting jobset: {error_message}')
        except json.JSONDecodeError:
            error_message = str(err.body)
            # Fixed copy-paste: this is the get path, not the create path.
            logger.error(f'error getting jobset: {error_message}')
        else:
            # Re-raise the exception if it's a different error
            raise err
        return None
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def delete_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
    """Deletes jobset in this namespace

    Args:
        namespace: Namespace where jobset exists
        job_name: Name of jobset to delete

    Returns:
        Response from delete operation
    """
    try:
        context = kubernetes_utils.get_current_kube_config_context_name()
        api = kube_client.crd_api(context=context)
        return api.delete_namespaced_custom_object(
            group=JOBSET_API_GROUP,
            version=JOBSET_API_VERSION,
            namespace=namespace,
            plural=JOBSET_PLURAL,
            name=job_name,
        )
    except kube_client.api_exception() as err:
        try:
            parsed = json.loads(err.body)
            logger.error(f"error deleting jobset: {parsed.get('message', '')}")
        except json.JSONDecodeError:
            # Unstructured error body: log it verbatim and swallow.
            logger.error(f'error deleting jobset: {str(err.body)}')
        else:
            # Body was structured JSON: surface the failure to the caller.
            raise err
        return None
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def stop_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
    """Stops jobset in this namespace

    Suspends the JobSet (``spec.suspend=True``), records who stopped it in
    annotations, and best-effort deactivates the matching Kueue workload so
    the scheduler does not immediately resume it.

    Args:
        namespace: Namespace where the jobset exists.
        job_name: Name of the jobset to stop.

    Returns:
        The patch response from the JobSet API.

    Raises:
        JobNotFoundError: when the jobset does not exist (HTTP 404).
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    try:
        # First check if the job exists
        get_jobset(namespace, job_name)

        # Apply patch to suspend the jobset and add annotations
        # Time is in UTC but gets converted to local timezone in the konduktor status UI
        patch = {
            'spec': {'suspend': True},
            'metadata': {
                'annotations': {
                    backend_constants.STOP_USERID_LABEL: (
                        common_utils.user_and_hostname_hash()
                    ),
                    backend_constants.STOP_USERNAME_LABEL: (
                        common_utils.get_cleaned_username()
                    ),
                }
            },
        }
        response = kube_client.crd_api(context=context).patch_namespaced_custom_object(
            group=JOBSET_API_GROUP,
            version=JOBSET_API_VERSION,
            namespace=namespace,
            plural=JOBSET_PLURAL,
            name=job_name,
            body=patch,
        )

        # Also suspend the associated Kueue workload to prevent automatic resumption
        try:
            # Find the workload for this jobset
            workloads = kube_client.crd_api(
                context=context
            ).list_namespaced_custom_object(
                group='kueue.x-k8s.io',
                version='v1beta1',
                namespace=namespace,
                plural='workloads',
            )
            for workload in workloads.get('items', []):
                # Workloads are matched by the 'jobset-<name>-' name prefix;
                # only the first match is patched.
                if workload['metadata']['name'].startswith(f'jobset-{job_name}-'):
                    # Suspend the workload
                    workload_patch = {'spec': {'active': False}}
                    kube_client.crd_api(context=context).patch_namespaced_custom_object(
                        group='kueue.x-k8s.io',
                        version='v1beta1',
                        namespace=namespace,
                        plural='workloads',
                        name=workload['metadata']['name'],
                        body=workload_patch,
                    )
                    break
        except Exception:
            # If workload suspension fails, continue (JobSet suspension still worked)
            pass

        return response
    except kube_client.api_exception() as e:
        if e.status == 404:
            raise JobNotFoundError(f'Job {job_name} not found in namespace {namespace}')
        else:
            raise e
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def start_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
    """Starts jobset in this namespace

    Resumes the JobSet (``spec.suspend=False``), clears the stop-by
    annotations set by stop_jobset, and best-effort reactivates the
    matching Kueue workload.

    Args:
        namespace: Namespace where the jobset exists.
        job_name: Name of the jobset to start.

    Returns:
        The patch response from the JobSet API.

    Raises:
        JobNotFoundError: when the jobset does not exist (HTTP 404).
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    try:
        # First check if the job exists
        get_jobset(namespace, job_name)

        # Apply patch to resume the jobset and remove suspension annotations
        # (a None value in a strategic-merge patch deletes the annotation)
        patch = {
            'spec': {'suspend': False},
            'metadata': {
                'annotations': {
                    backend_constants.STOP_USERID_LABEL: None,
                    backend_constants.STOP_USERNAME_LABEL: None,
                }
            },
        }
        response = kube_client.crd_api(context=context).patch_namespaced_custom_object(
            group=JOBSET_API_GROUP,
            version=JOBSET_API_VERSION,
            namespace=namespace,
            plural=JOBSET_PLURAL,
            name=job_name,
            body=patch,
        )

        # Also reactivate the associated Kueue workload
        try:
            # Find the workload for this jobset
            workloads = kube_client.crd_api(
                context=context
            ).list_namespaced_custom_object(
                group='kueue.x-k8s.io',
                version='v1beta1',
                namespace=namespace,
                plural='workloads',
            )
            for workload in workloads.get('items', []):
                # Same name-prefix match as stop_jobset; first match only.
                if workload['metadata']['name'].startswith(f'jobset-{job_name}-'):
                    # Reactivate the workload
                    workload_patch = {'spec': {'active': True}}
                    kube_client.crd_api(context=context).patch_namespaced_custom_object(
                        group='kueue.x-k8s.io',
                        version='v1beta1',
                        namespace=namespace,
                        plural='workloads',
                        name=workload['metadata']['name'],
                        body=workload_patch,
                    )
                    break
        except Exception:
            # If workload reactivation fails, continue (JobSet resumption still worked)
            pass

        return response
    except kube_client.api_exception() as e:
        if e.status == 404:
            raise JobNotFoundError(f'Job {job_name} not found in namespace {namespace}')
        else:
            raise e
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def get_job(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
    """Gets the first worker Job object belonging to a jobset.

    The child Job is looked up by the fixed name '{job_name}-workers-0'
    via the batch API. (The previous docstring mentioned a 'worker_id'
    parameter that does not exist; only the first worker job is fetched.)

    Args:
        namespace: Namespace where job exists
        job_name: Name of jobset containing the job

    Returns:
        Job object if found; None when an API error occurred whose body
        could not be parsed as JSON (JSON-bodied errors are logged and
        re-raised).
    """
    try:
        # Get the job object using the job name
        # pattern {jobset-name}-workers-0-{worker_id}
        job_name = f'{job_name}-workers-0'
        context = kubernetes_utils.get_current_kube_config_context_name()
        response = kube_client.batch_api(context=context).read_namespaced_job(
            name=job_name, namespace=namespace
        )
        return response
    except kube_client.api_exception() as err:
        try:
            error_body = json.loads(err.body)
            error_message = error_body.get('message', '')
            logger.error(f'error getting job: {error_message}')
        except json.JSONDecodeError:
            error_message = str(err.body)
            logger.error(f'error getting job: {error_message}')
        else:
            # Re-raise the exception if it's a different error
            raise err
        return None
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _parse_timestamp_filter(timestamp_str: str) -> datetime:
|
|
392
|
+
"""Parse timestamp string into datetime object for filtering
|
|
393
|
+
|
|
394
|
+
Supported formats:
|
|
395
|
+
- "08/06/25 03:54PM" (full datetime)
|
|
396
|
+
- "08/06/25" (date only)
|
|
397
|
+
- "03:54PM" (time only, uses today's date)
|
|
398
|
+
"""
|
|
399
|
+
|
|
400
|
+
# Try different formats
|
|
401
|
+
formats = [
|
|
402
|
+
'%m/%d/%y %I:%M%p', # 08/06/25 03:54PM (full datetime)
|
|
403
|
+
'%m/%d/%y', # 08/06/25 (date only)
|
|
404
|
+
'%I:%M%p', # 03:54PM (time only)
|
|
405
|
+
]
|
|
406
|
+
|
|
407
|
+
for fmt in formats:
|
|
408
|
+
try:
|
|
409
|
+
dt = datetime.strptime(timestamp_str, fmt)
|
|
410
|
+
|
|
411
|
+
# Handle time-only format (add today's date)
|
|
412
|
+
if fmt == '%I:%M%p':
|
|
413
|
+
today = datetime.now().strftime('%m/%d/%y')
|
|
414
|
+
dt = datetime.strptime(f'{today} {timestamp_str}', '%m/%d/%y %I:%M%p')
|
|
415
|
+
|
|
416
|
+
# If no timezone info, assume local timezone and convert to UTC
|
|
417
|
+
if dt.tzinfo is None:
|
|
418
|
+
if fmt in ['%m/%d/%y %I:%M%p', '%I:%M%p']:
|
|
419
|
+
# For display format, convert from local time to UTC
|
|
420
|
+
# Get current local timezone offset
|
|
421
|
+
local_offset = time.timezone if not time.daylight else time.altzone
|
|
422
|
+
# Convert local time to UTC by adding the offset
|
|
423
|
+
# (since timezone is negative)
|
|
424
|
+
dt = dt.replace(tzinfo=timezone.utc) + timedelta(
|
|
425
|
+
seconds=abs(local_offset)
|
|
426
|
+
)
|
|
427
|
+
else:
|
|
428
|
+
# Handle date-only format (local midnight --> UTC)
|
|
429
|
+
local_tz = datetime.now().astimezone().tzinfo
|
|
430
|
+
return dt.replace(tzinfo=local_tz).astimezone(timezone.utc)
|
|
431
|
+
return dt
|
|
432
|
+
except ValueError:
|
|
433
|
+
continue
|
|
434
|
+
|
|
435
|
+
raise ValueError(
|
|
436
|
+
f"Unable to parse timestamp '{timestamp_str}'. "
|
|
437
|
+
f"Supported formats: '08/06/25 03:54PM', '08/06/25', '03:54PM'"
|
|
438
|
+
)
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def _format_timestamp(timestamp: str) -> str:
|
|
442
|
+
"""Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
|
|
443
|
+
# Parse UTC timestamp and convert to local time
|
|
444
|
+
dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
|
|
445
|
+
tzinfo=timezone.utc
|
|
446
|
+
)
|
|
447
|
+
dt_local = dt_utc.astimezone() # Convert to local timezone
|
|
448
|
+
return dt_local.strftime('%m/%d/%y %I:%M%p')
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _get_job_start_time(job: Dict[str, Any]) -> str:
|
|
452
|
+
status = job.get('status', {})
|
|
453
|
+
for condition in status.get('conditions', []):
|
|
454
|
+
if condition['reason'] == 'ResumeJobs':
|
|
455
|
+
return condition.get('lastTransitionTime', '')
|
|
456
|
+
return '-'
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
|
|
460
|
+
"""Extract end time from JobSet conditions (Completed or Failed)"""
|
|
461
|
+
conditions = job.get('status', {}).get('conditions', [])
|
|
462
|
+
for condition in conditions:
|
|
463
|
+
# Look for terminal conditions with status=True
|
|
464
|
+
if (
|
|
465
|
+
condition.get('type') in ['Completed', 'Failed']
|
|
466
|
+
and condition.get('status') == 'True'
|
|
467
|
+
):
|
|
468
|
+
return condition.get('lastTransitionTime', '')
|
|
469
|
+
return '-'
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def _get_time_delta(delta: 'timedelta') -> Tuple[str, 'timedelta']:
|
|
473
|
+
total_seconds = int(delta.total_seconds())
|
|
474
|
+
|
|
475
|
+
days, remainder = divmod(total_seconds, 86400) # 86400 seconds in a day
|
|
476
|
+
hours, remainder = divmod(remainder, 3600) # 3600 seconds in an hour
|
|
477
|
+
minutes, seconds = divmod(remainder, 60) # 60 seconds in a minute
|
|
478
|
+
|
|
479
|
+
days_str = f'{days} day{"s" if days != 1 else ""}, ' if days > 0 else ''
|
|
480
|
+
hours_str = f'{hours} hr{"s" if hours != 1 else ""}, ' if hours > 0 else ''
|
|
481
|
+
minutes_str = (
|
|
482
|
+
f'{minutes} min{"s" if minutes != 1 else ""}'
|
|
483
|
+
if minutes > 0 and days == 0
|
|
484
|
+
else ''
|
|
485
|
+
)
|
|
486
|
+
|
|
487
|
+
seconds_str = (
|
|
488
|
+
f'{seconds} sec{"s" if seconds != 1 else ""}'
|
|
489
|
+
if seconds > 0 and days == 0 and hours == 0 and minutes == 0
|
|
490
|
+
else ''
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
|
|
494
|
+
return result if result else '<1 minute', delta
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _get_job_length(start_time: str, end_time: str) -> str:
|
|
498
|
+
if start_time == '-' or end_time == '-':
|
|
499
|
+
return '-'
|
|
500
|
+
else:
|
|
501
|
+
start = datetime.strptime(start_time, '%m/%d/%y %I:%M%p')
|
|
502
|
+
end = datetime.strptime(end_time, '%m/%d/%y %I:%M%p')
|
|
503
|
+
delta, _ = _get_time_delta(end - start)
|
|
504
|
+
return delta
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def show_status_table(
    namespace: str,
    all_users: bool,
    limit: Optional[int] = None,
    after: Optional[str] = None,
    before: Optional[str] = None,
):
    """Compute cluster table values and display with optional filtering and pagination.

    Args:
        namespace: Kubernetes namespace to search
        all_users: Whether to show jobs from all users
        limit: Maximum number of jobs to display
        after: Show jobs created after this timestamp
        before: Show jobs created before this timestamp
    """
    # TODO(zhwu): Update the information for autostop clusters.

    def _get_status_string_colorized(
        status: Dict[str, Any], job: Dict[str, Any]
    ) -> str:
        # Map a jobset status dict to a colorized single-word display state.
        # Precedence: terminal state > active > suspended > pending.
        # Handle case where status might be empty or missing
        if not status:
            return (
                f'{colorama.Fore.YELLOW}'
                f'{JobStatus.PENDING.name}{colorama.Style.RESET_ALL}'
            )

        terminalState = status.get('terminalState', None)
        if terminalState and terminalState.upper() == JobStatus.COMPLETED.name.upper():
            return (
                f'{colorama.Fore.GREEN}'
                f'{JobStatus.COMPLETED.name}{colorama.Style.RESET_ALL}'
            )
        elif terminalState and terminalState.upper() == JobStatus.FAILED.name.upper():
            return (
                f'{colorama.Fore.RED}'
                f'{JobStatus.FAILED.name}{colorama.Style.RESET_ALL}'
            )
        # Only the first replicated job's status is inspected below.
        elif status.get('replicatedJobsStatus', [{}])[0].get('ready', False):
            return (
                f'{colorama.Fore.CYAN}'
                f'{JobStatus.ACTIVE.name}{colorama.Style.RESET_ALL}'
            )
        elif status.get('replicatedJobsStatus', [{}])[0].get('suspended', False):
            # Check if this was manually suspended
            # (stop_jobset records who stopped it in these annotations).
            annotations = job.get('metadata', {}).get('annotations', {})
            if annotations.get(backend_constants.STOP_USERID_LABEL):
                username = annotations.get(
                    backend_constants.STOP_USERNAME_LABEL, 'unknown'
                )
                return (
                    f'{colorama.Fore.BLUE}'
                    f'{JobStatus.SUSPENDED.name} '
                    f'(by {username}){colorama.Style.RESET_ALL}'
                )
            else:
                return (
                    f'{colorama.Fore.BLUE}'
                    f'{JobStatus.SUSPENDED.name} (by system){colorama.Style.RESET_ALL}'
                )
        else:
            return (
                f'{colorama.Fore.YELLOW}'
                f'{JobStatus.PENDING.name}{colorama.Style.RESET_ALL}'
            )

    def _get_resources(job: Dict[str, Any]) -> str:
        # Summarize pod count and per-pod cpu/mem (+ accelerator) for display.
        # NOTE(review): these direct indexes assume the jobset spec shape
        # produced by create_jobset — first replicatedJob, first container.
        num_pods = int(
            job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
        )  # noqa: E501
        resources = job['spec']['replicatedJobs'][0]['template']['spec']['template'][
            'spec'
        ]['containers'][0]['resources']['limits']  # noqa: E501
        cpu, memory = resources['cpu'], resources['memory']
        accelerator = job['metadata']['labels'].get(JOBSET_ACCELERATOR_LABEL, None)
        num_accelerators = job['metadata']['labels'].get(
            JOBSET_NUM_ACCELERATORS_LABEL, None
        )
        # 'None'/'0' are string sentinels written by the jobset template.
        if accelerator and accelerator != 'None':
            if num_accelerators and num_accelerators != '0':
                accelerator_with_count = f'{accelerator}:{num_accelerators}'
            else:
                accelerator_with_count = accelerator
            return f'{num_pods}x({cpu}CPU, {memory}MEM, {accelerator_with_count})'
        else:
            return f'{num_pods}x({cpu}CPU, {memory}MEM)'

    # The USER column only exists in the all-users view.
    if all_users:
        columns = [
            'NAME',
            'USER',
            'STATUS',
            'RESOURCES',
            'SUBMITTED',
            'START TIME',
            'END TIME',
            'DURATION',
        ]
    else:
        columns = [
            'NAME',
            'STATUS',
            'RESOURCES',
            'SUBMITTED',
            'START TIME',
            'END TIME',
            'DURATION',
        ]
    job_table = log_utils.create_table(columns)
    job_specs = list_jobset(namespace)
    assert job_specs is not None, 'Retrieving jobs failed'

    # Parse timestamp filters if provided
    after_dt = None
    before_dt = None
    if after:
        try:
            after_dt = _parse_timestamp_filter(after)
        except ValueError as e:
            click.secho(f'Error parsing --after timestamp: {e}', fg='red', err=True)
            return
    if before:
        try:
            before_dt = _parse_timestamp_filter(before)
        except ValueError as e:
            click.secho(f'Error parsing --before timestamp: {e}', fg='red', err=True)
            return

    rows = []
    for job in job_specs['items']:
        # Apply timestamp filtering (creationTimestamp is naive UTC; both
        # bounds are exclusive).
        if after_dt or before_dt:
            job_creation_time = datetime.strptime(
                job['metadata']['creationTimestamp'], '%Y-%m-%dT%H:%M:%SZ'
            ).replace(tzinfo=timezone.utc)

            if after_dt and job_creation_time <= after_dt:
                continue
            if before_dt and job_creation_time >= before_dt:
                continue
        # Get start time
        start_time = _get_job_start_time(job)
        if start_time != '-':
            start_time = _format_timestamp(start_time)

        # Get submitted time (how long ago)
        time_delta = datetime.now(timezone.utc) - datetime.strptime(
            job['metadata']['creationTimestamp'], '%Y-%m-%dT%H:%M:%SZ'
        ).replace(tzinfo=timezone.utc)
        submitted_time, _ = _get_time_delta(time_delta)

        # Get end time (from JobSet conditions)
        end_time = _get_end_time_from_conditions(job)
        if end_time != '-':
            end_time = _format_timestamp(end_time)

        job_length = _get_job_length(start_time, end_time)

        # Last element (creationTimestamp) is used only as the sort key and
        # is stripped before rendering.
        if all_users:
            rows.append(
                [
                    job['metadata']['name'],
                    job['metadata']['labels'][JOBSET_USERID_LABEL],
                    _get_status_string_colorized(job.get('status', {}), job),
                    _get_resources(job),
                    submitted_time,
                    start_time,
                    end_time,
                    job_length,
                    job['metadata']['creationTimestamp'],
                ]
            )
        elif (
            not all_users
            and job['metadata']['labels'][JOBSET_USER_LABEL]
            == common_utils.get_cleaned_username()
        ):
            rows.append(
                [
                    job['metadata']['name'],
                    _get_status_string_colorized(job.get('status', {}), job),
                    _get_resources(job),
                    submitted_time,
                    start_time,
                    end_time,
                    job_length,
                    job['metadata']['creationTimestamp'],
                ]
            )

    # Sort by creation timestamp (most recent first)
    rows = sorted(rows, key=lambda x: x[-1], reverse=True)

    # Apply limit if specified
    if limit and limit > 0:
        rows = rows[:limit]

    # Show pagination info if applicable
    total_jobs = len(job_specs['items'])
    filtered_jobs = len(rows)

    if limit or after or before:
        filter_info = []
        if after:
            filter_info.append(f'after {after}')
        if before:
            filter_info.append(f'before {before}')
        if limit:
            filter_info.append(f'limit {limit}')

        filter_str = ', '.join(filter_info)
        click.secho(f'Showing {filtered_jobs} jobs ({filter_str})', fg='yellow')
        if total_jobs != filtered_jobs:
            click.secho(f'Total jobs in namespace: {total_jobs}', fg='yellow')

    # Remove the sorting timestamp and add rows to table
    for row in rows:
        job_table.add_row(row[:-1])
    print(job_table)
|