konduktor-nightly 0.1.0.dev20250808105243__py3-none-any.whl → 0.1.0.dev20250810104857__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of konduktor-nightly has been flagged as potentially problematic; see the registry listing for details.
- konduktor/__init__.py +2 -2
- konduktor/backends/constants.py +7 -1
- konduktor/backends/jobset_utils.py +339 -28
- konduktor/backends/pod_utils.py +1 -1
- konduktor/cli.py +353 -18
- konduktor/config.py +1 -1
- konduktor/data/aws/s3.py +2 -1
- konduktor/data/gcp/gcs.py +2 -1
- konduktor/task.py +1 -1
- konduktor/templates/jobset.yaml.j2 +1 -1
- konduktor/utils/common_utils.py +1 -1
- konduktor/utils/kubernetes_utils.py +5 -4
- {konduktor_nightly-0.1.0.dev20250808105243.dist-info → konduktor_nightly-0.1.0.dev20250810104857.dist-info}/METADATA +1 -1
- {konduktor_nightly-0.1.0.dev20250808105243.dist-info → konduktor_nightly-0.1.0.dev20250810104857.dist-info}/RECORD +17 -17
- {konduktor_nightly-0.1.0.dev20250808105243.dist-info → konduktor_nightly-0.1.0.dev20250810104857.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250808105243.dist-info → konduktor_nightly-0.1.0.dev20250810104857.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250808105243.dist-info → konduktor_nightly-0.1.0.dev20250810104857.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
 __all__ = ['launch', 'Resources', 'Task', 'Serving']
 
 # Replaced with the current commit when building the wheels.
-_KONDUKTOR_COMMIT_SHA = '
+_KONDUKTOR_COMMIT_SHA = '0f0b36c3a67aa7c60d6cb33240631b7c8ccaed03'
 os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
 
 
@@ -45,5 +45,5 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev0.1.0.
+__version__ = '1.0.0.dev0.1.0.dev20250810104857'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
konduktor/backends/constants.py
CHANGED
@@ -9,5 +9,11 @@ USER_LABEL = 'trainy.ai/username'
 ACCELERATOR_LABEL = 'trainy.ai/accelerator'
 NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
 
+# Start/stop/status related labels
+STOP_USERID_LABEL = 'trainy.ai/stop-userid'
+STOP_USERNAME_LABEL = 'trainy.ai/stop-username'
+
 # Secret labels
-SECRET_BASENAME_LABEL = '
+SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'
+SECRET_KIND_LABEL = 'trainy.ai/secret-kind'
+SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'
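As a minimal, standalone sketch of how label constants like these are typically attached to Kubernetes object metadata and then queried back with a label selector (not code from this package; the namespace, user hash, and secret name below are made-up placeholders):

# Illustrative only: attach trainy.ai/* labels at creation time and filter on them later.
from kubernetes import client, config

SECRET_OWNER_LABEL = 'trainy.ai/secret-owner'        # mirrors the constant above
SECRET_BASENAME_LABEL = 'trainy.ai/secret-basename'

config.load_kube_config()                            # assumes a local kubeconfig
v1 = client.CoreV1Api()

# Labels are set on the object's metadata...
metadata = client.V1ObjectMeta(
    name='my-secret-abc123',                         # hypothetical name
    labels={SECRET_OWNER_LABEL: 'abc123', SECRET_BASENAME_LABEL: 'my-secret'},
)

# ...and can later drive a server-side label selector when listing objects.
selector = f'{SECRET_OWNER_LABEL}=abc123'
secrets = v1.list_namespaced_secret('default', label_selector=selector)
print([s.metadata.name for s in secrets.items])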
konduktor/backends/jobset_utils.py
CHANGED
@@ -3,10 +3,12 @@
 import enum
 import json
 import tempfile
+import time
 import typing
 from datetime import datetime, timezone
 from typing import Any, Dict, Optional, Tuple
 
+import click
 import colorama
 
 if typing.TYPE_CHECKING:
@@ -217,6 +219,131 @@ def delete_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
     return None
 
 
+def stop_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Stops jobset in this namespace"""
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        # First check if the job exists
+        get_jobset(namespace, job_name)
+
+        # Apply patch to suspend the jobset and add annotations
+        # Time is in UTC but gets converted to local timezone in the konduktor status UI
+        patch = {
+            'spec': {'suspend': True},
+            'metadata': {
+                'annotations': {
+                    backend_constants.STOP_USERID_LABEL: common_utils.user_and_hostname_hash(),
+                    backend_constants.STOP_USERNAME_LABEL: common_utils.get_cleaned_username(),
+                }
+            },
+        }
+        response = kube_client.crd_api(context=context).patch_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            name=job_name,
+            body=patch,
+        )
+
+        # Also suspend the associated Kueue workload to prevent automatic resumption
+        try:
+            # Find the workload for this jobset
+            workloads = kube_client.crd_api(
+                context=context
+            ).list_namespaced_custom_object(
+                group='kueue.x-k8s.io',
+                version='v1beta1',
+                namespace=namespace,
+                plural='workloads',
+            )
+            for workload in workloads.get('items', []):
+                if workload['metadata']['name'].startswith(f'jobset-{job_name}-'):
+                    # Suspend the workload
+                    workload_patch = {'spec': {'active': False}}
+                    kube_client.crd_api(context=context).patch_namespaced_custom_object(
+                        group='kueue.x-k8s.io',
+                        version='v1beta1',
+                        namespace=namespace,
+                        plural='workloads',
+                        name=workload['metadata']['name'],
+                        body=workload_patch,
+                    )
+                    break
+        except Exception:
+            # If workload suspension fails, continue (JobSet suspension still worked)
+            pass
+
+        return response
+    except kube_client.api_exception() as e:
+        if e.status == 404:
+            raise JobNotFoundError(f'Job {job_name} not found in namespace {namespace}')
+        else:
+            raise e
+
+
+def start_jobset(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
+    """Starts jobset in this namespace"""
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        # First check if the job exists
+        get_jobset(namespace, job_name)
+
+        # Apply patch to resume the jobset and remove suspension annotations
+        patch = {
+            'spec': {'suspend': False},
+            'metadata': {
+                'annotations': {
+                    backend_constants.STOP_USERID_LABEL: None,
+                    backend_constants.STOP_USERNAME_LABEL: None,
+                }
+            },
+        }
+        response = kube_client.crd_api(context=context).patch_namespaced_custom_object(
+            group=JOBSET_API_GROUP,
+            version=JOBSET_API_VERSION,
+            namespace=namespace,
+            plural=JOBSET_PLURAL,
+            name=job_name,
+            body=patch,
+        )
+
+        # Also reactivate the associated Kueue workload
+        try:
+            # Find the workload for this jobset
+            workloads = kube_client.crd_api(
+                context=context
+            ).list_namespaced_custom_object(
+                group='kueue.x-k8s.io',
+                version='v1beta1',
+                namespace=namespace,
+                plural='workloads',
+            )
+            for workload in workloads.get('items', []):
+                if workload['metadata']['name'].startswith(f'jobset-{job_name}-'):
+                    # Reactivate the workload
+                    workload_patch = {'spec': {'active': True}}
+                    kube_client.crd_api(context=context).patch_namespaced_custom_object(
+                        group='kueue.x-k8s.io',
+                        version='v1beta1',
+                        namespace=namespace,
+                        plural='workloads',
+                        name=workload['metadata']['name'],
+                        body=workload_patch,
+                    )
+                    break
+        except Exception:
+            # If workload reactivation fails, continue (JobSet resumption still worked)
+            pass
+
+        return response
+    except kube_client.api_exception() as e:
+        if e.status == 404:
+            raise JobNotFoundError(f'Job {job_name} not found in namespace {namespace}')
+        else:
+            raise e
+
+
 def get_job(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
     """Gets a specific job from a jobset by name and worker index
 
@@ -251,16 +378,82 @@ def get_job(namespace: str, job_name: str) -> Optional[Dict[str, Any]]:
     return None
 
 
-def 
-"""
+def _parse_timestamp_filter(timestamp_str: str) -> datetime:
+    """Parse timestamp string into datetime object for filtering
 
-
-
-
+    Supported formats:
+    - "08/06/25 03:54PM" (full datetime)
+    - "08/06/25" (date only)
+    - "03:54PM" (time only, uses today's date)
+    """
+
+    # Try different formats
+    formats = [
+        '%m/%d/%y %I:%M%p',  # 08/06/25 03:54PM (full datetime)
+        '%m/%d/%y',  # 08/06/25 (date only)
+        '%I:%M%p',  # 03:54PM (time only)
+    ]
+
+    for fmt in formats:
+        try:
+            dt = datetime.strptime(timestamp_str, fmt)
+
+            # Handle time-only format (add today's date)
+            if fmt == '%I:%M%p':
+                today = datetime.now().strftime('%m/%d/%y')
+                dt = datetime.strptime(f'{today} {timestamp_str}', '%m/%d/%y %I:%M%p')
+
+            # If no timezone info, assume local timezone and convert to UTC
+            if dt.tzinfo is None:
+                if fmt in ['%m/%d/%y %I:%M%p', '%I:%M%p']:
+                    # For display format, convert from local time to UTC
+                    # Get current local timezone offset
+                    local_offset = time.timezone if not time.daylight else time.altzone
+                    # Convert local time to UTC by adding the offset
+                    # (since timezone is negative)
+                    dt = dt.replace(tzinfo=timezone.utc) + timedelta(
+                        seconds=abs(local_offset)
+                    )
+                else:
+                    dt = dt.replace(tzinfo=timezone.utc)
+            return dt
+        except ValueError:
+            continue
+
+    raise ValueError(
+        f"Unable to parse timestamp '{timestamp_str}'. "
+        f"Supported formats: '08/06/25 03:54PM', '08/06/25', '03:54PM'"
+    )
+
+
+def show_status_table(
+    namespace: str,
+    all_users: bool,
+    limit: Optional[int] = None,
+    after: Optional[str] = None,
+    before: Optional[str] = None,
+):
+    """Compute cluster table values and display with optional filtering and pagination.
+
+    Args:
+        namespace: Kubernetes namespace to search
+        all_users: Whether to show jobs from all users
+        limit: Maximum number of jobs to display
+        after: Show jobs created after this timestamp
+        before: Show jobs created before this timestamp
     """
     # TODO(zhwu): Update the information for autostop clusters.
 
-    def _get_status_string_colorized(
+    def _get_status_string_colorized(
+        status: Dict[str, Any], job: Dict[str, Any]
+    ) -> str:
+        # Handle case where status might be empty or missing
+        if not status:
+            return (
+                f'{colorama.Fore.YELLOW}'
+                f'{JobStatus.PENDING.name}{colorama.Style.RESET_ALL}'
+            )
+
         terminalState = status.get('terminalState', None)
         if terminalState and terminalState.upper() == JobStatus.COMPLETED.name.upper():
             return (
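As a minimal, self-contained illustration of the three timestamp formats the new _parse_timestamp_filter accepts (stdlib only; the sample strings are made up, and the local-to-UTC handling here is deliberately simplified compared with the explicit time.timezone offset used in the diff):

# Standalone sketch of multi-format parsing with datetime.strptime.
from datetime import datetime, timezone

FORMATS = ['%m/%d/%y %I:%M%p', '%m/%d/%y', '%I:%M%p']

def parse(ts: str) -> datetime:
    for fmt in FORMATS:
        try:
            dt = datetime.strptime(ts, fmt)
            if fmt == '%I:%M%p':
                # Time-only input: borrow today's date.
                today = datetime.now()
                dt = dt.replace(year=today.year, month=today.month, day=today.day)
            # Treat the naive result as local time and convert to UTC.
            return dt.astimezone(timezone.utc)
        except ValueError:
            continue
    raise ValueError(f'Unrecognized timestamp: {ts!r}')

print(parse('08/06/25 03:54PM'))   # full datetime
print(parse('08/06/25'))           # date only -> midnight
print(parse('03:54PM'))            # time only -> today's date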
@@ -272,16 +465,28 @@ def show_status_table(namespace: str, all_users: bool):
                 f'{colorama.Fore.RED}'
                 f'{JobStatus.FAILED.name}{colorama.Style.RESET_ALL}'
             )
-        elif status
+        elif status.get('replicatedJobsStatus', [{}])[0].get('ready', False):
             return (
                 f'{colorama.Fore.CYAN}'
                 f'{JobStatus.ACTIVE.name}{colorama.Style.RESET_ALL}'
             )
-        elif status
-
-
-
-
+        elif status.get('replicatedJobsStatus', [{}])[0].get('suspended', False):
+            # Check if this was manually suspended
+            annotations = job.get('metadata', {}).get('annotations', {})
+            if annotations.get(backend_constants.STOP_USERID_LABEL):
+                username = annotations.get(
+                    backend_constants.STOP_USERNAME_LABEL, 'unknown'
+                )
+                return (
+                    f'{colorama.Fore.BLUE}'
+                    f'{JobStatus.SUSPENDED.name} '
+                    f'(by {username}){colorama.Style.RESET_ALL}'
+                )
+            else:
+                return (
+                    f'{colorama.Fore.BLUE}'
+                    f'{JobStatus.SUSPENDED.name} (by system){colorama.Style.RESET_ALL}'
+                )
         else:
             return (
                 f'{colorama.Fore.YELLOW}'
@@ -296,13 +501,32 @@ def show_status_table(namespace: str, all_users: bool):
 
         days, remainder = divmod(total_seconds, 86400)  # 86400 seconds in a day
         hours, remainder = divmod(remainder, 3600)  # 3600 seconds in an hour
-        minutes,
+        minutes, seconds = divmod(remainder, 60)  # 60 seconds in a minute
+
+        days_str = f'{days} day{"s" if days != 1 else ""}, ' if days > 0 else ''
+        hours_str = f'{hours} hr{"s" if hours != 1 else ""}, ' if hours > 0 else ''
+        minutes_str = (
+            f'{minutes} min{"s" if minutes != 1 else ""}'
+            if minutes > 0 and days == 0
+            else ''
+        )
+        seconds_str = (
+            f'{seconds} sec{"s" if seconds != 1 else ""}'
+            if seconds > 0 and days == 0 and hours == 0 and minutes == 0
+            else ''
+        )
 
-
-
-        minutes_str = f'{minutes} minutes' if minutes > 0 else ''
+        result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
+        return result if result else '<1 minute', delta
 
-
+    def _format_timestamp(timestamp: str) -> str:
+        """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
+        # Parse UTC timestamp and convert to local time
+        dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
+            tzinfo=timezone.utc
+        )
+        dt_local = dt_utc.astimezone()  # Convert to local timezone
+        return dt_local.strftime('%m/%d/%y %I:%M%p')
 
     def _get_resources(job: Dict[str, Any]) -> str:
         num_pods = int(
@@ -314,27 +538,86 @@ def show_status_table(namespace: str, all_users: bool):
         cpu, memory = resources['cpu'], resources['memory']
         accelerator = job['metadata']['labels'].get(JOBSET_ACCELERATOR_LABEL, None)
         if accelerator:
-            return f'{num_pods}x({cpu}CPU,
+            return f'{num_pods}x({cpu}CPU, {memory}MEM, {accelerator})'
         else:
-            return f'{num_pods}x({cpu}CPU,
+            return f'{num_pods}x({cpu}CPU, {memory}MEM)'
 
     if all_users:
-        columns = [
+        columns = [
+            'NAME',
+            'USER',
+            'STATUS',
+            'RESOURCES',
+            'SUBMITTED',
+            'START TIME',
+            'END TIME',
+        ]
     else:
-        columns = ['NAME', 'STATUS', 'RESOURCES', 'SUBMITTED']
+        columns = ['NAME', 'STATUS', 'RESOURCES', 'SUBMITTED', 'START TIME', 'END TIME']
     job_table = log_utils.create_table(columns)
     job_specs = list_jobset(namespace)
     assert job_specs is not None, 'Retrieving jobs failed'
+
+    # Parse timestamp filters if provided
+    after_dt = None
+    before_dt = None
+    if after:
+        try:
+            after_dt = _parse_timestamp_filter(after)
+        except ValueError as e:
+            click.secho(f'Error parsing --after timestamp: {e}', fg='red', err=True)
+            return
+    if before:
+        try:
+            before_dt = _parse_timestamp_filter(before)
+        except ValueError as e:
+            click.secho(f'Error parsing --before timestamp: {e}', fg='red', err=True)
+            return
+
     rows = []
     for job in job_specs['items']:
+        # Apply timestamp filtering
+        if after_dt or before_dt:
+            job_creation_time = datetime.strptime(
+                job['metadata']['creationTimestamp'], '%Y-%m-%dT%H:%M:%SZ'
+            ).replace(tzinfo=timezone.utc)
+
+            if after_dt and job_creation_time <= after_dt:
+                continue
+            if before_dt and job_creation_time >= before_dt:
+                continue
+        # Get start time
+        start_time = _format_timestamp(job['metadata']['creationTimestamp'])
+
+        # Get submitted time (how long ago)
+        submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
+
+        # Get end time (from JobSet conditions)
+        def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
+            """Extract end time from JobSet conditions (Completed or Failed)"""
+            conditions = job.get('status', {}).get('conditions', [])
+            for condition in conditions:
+                # Look for terminal conditions with status=True
+                if (
+                    condition.get('type') in ['Completed', 'Failed']
+                    and condition.get('status') == 'True'
+                ):
+                    return _format_timestamp(condition.get('lastTransitionTime', ''))
+            return '-'
+
+        end_time = _get_end_time_from_conditions(job)
+
         if all_users:
             rows.append(
                 [
                     job['metadata']['name'],
                     job['metadata']['labels'][JOBSET_USERID_LABEL],
-                    _get_status_string_colorized(job
+                    _get_status_string_colorized(job.get('status', {}), job),
                     _get_resources(job),
-
+                    submitted_time,
+                    start_time,
+                    end_time,
+                    job['metadata']['creationTimestamp'],
                 ]
             )
         elif (
@@ -345,13 +628,41 @@ def show_status_table(namespace: str, all_users: bool):
             rows.append(
                 [
                     job['metadata']['name'],
-                    _get_status_string_colorized(job.get('status', {})),
+                    _get_status_string_colorized(job.get('status', {}), job),
                     _get_resources(job),
-
+                    submitted_time,
+                    start_time,
+                    end_time,
+                    job['metadata']['creationTimestamp'],
                 ]
             )
-
-    #
+
+    # Sort by creation timestamp (most recent first)
+    rows = sorted(rows, key=lambda x: x[-1], reverse=True)
+
+    # Apply limit if specified
+    if limit and limit > 0:
+        rows = rows[:limit]
+
+    # Show pagination info if applicable
+    total_jobs = len(job_specs['items'])
+    filtered_jobs = len(rows)
+
+    if limit or after or before:
+        filter_info = []
+        if after:
+            filter_info.append(f'after {after}')
+        if before:
+            filter_info.append(f'before {before}')
+        if limit:
+            filter_info.append(f'limit {limit}')
+
+        filter_str = ', '.join(filter_info)
+        click.secho(f'Showing {filtered_jobs} jobs ({filter_str})', fg='yellow')
+        if total_jobs != filtered_jobs:
+            click.secho(f'Total jobs in namespace: {total_jobs}', fg='yellow')
+
+    # Remove the sorting timestamp and add rows to table
     for row in rows:
-        job_table.add_row(row)
+        job_table.add_row(row[:-1])
     print(job_table)
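The stop_jobset/start_jobset functions above work by patching spec.suspend on the JobSet custom resource and toggling spec.active on the matching Kueue Workload. As a hedged, standalone sketch with the raw Kubernetes Python client (the group/version/plural strings are assumed from the upstream JobSet and Kueue APIs, not taken from konduktor's constants, and the namespace/job name are placeholders):

# Illustrative only: suspend a JobSet and deactivate its Kueue workload.
from kubernetes import client, config

config.load_kube_config()
crd = client.CustomObjectsApi()

namespace, job_name = 'default', 'my-job'          # placeholder values

# Suspend the JobSet (group/version/plural assumed: jobset.x-k8s.io/v1alpha2, jobsets).
crd.patch_namespaced_custom_object(
    group='jobset.x-k8s.io', version='v1alpha2', plural='jobsets',
    namespace=namespace, name=job_name,
    body={'spec': {'suspend': True}},
)

# Deactivate the matching Kueue Workload so the queue does not resume it.
workloads = crd.list_namespaced_custom_object(
    group='kueue.x-k8s.io', version='v1beta1', plural='workloads',
    namespace=namespace,
)
for wl in workloads.get('items', []):
    if wl['metadata']['name'].startswith(f'jobset-{job_name}-'):
        crd.patch_namespaced_custom_object(
            group='kueue.x-k8s.io', version='v1beta1', plural='workloads',
            namespace=namespace, name=wl['metadata']['name'],
            body={'spec': {'active': False}},
        )
        break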
konduktor/backends/pod_utils.py
CHANGED
@@ -154,7 +154,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
     default_secrets = []
 
     user_hash = common_utils.get_user_hash()
-    label_selector = f'
+    label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
     user_secrets = kubernetes_utils.list_secrets(
         namespace, context, label_filter=label_selector
     )
konduktor/cli.py
CHANGED
@@ -45,12 +45,13 @@ import click
 import colorama
 import dotenv
 import prettytable
-import yaml
+import yaml  # type: ignore
 from rich.progress import track
 
 import konduktor
 from konduktor import check as konduktor_check
 from konduktor import logging
+from konduktor.backends import constants as backend_constants
 from konduktor.backends import deployment_utils, jobset_utils
 from konduktor.utils import (
     common_utils,
@@ -606,21 +607,67 @@ def cli():
     required=False,
     help='Show all clusters, including those not owned by the ' 'current user.',
 )
+@click.option(
+    '--limit',
+    '-l',
+    default=None,
+    type=int,
+    help='Maximum number of jobs to display (e.g., --limit 100)',
+)
+@click.option(
+    '--after',
+    default=None,
+    type=str,
+    help=(
+        'Show jobs created after this timestamp '
+        '(e.g., --after "08/06/25 03:54PM", --after "08/06/25", --after "03:54PM")'
+    ),
+)
+@click.option(
+    '--before',
+    default=None,
+    type=str,
+    help=(
+        'Show jobs created before this timestamp '
+        '(e.g., --before "08/06/25 03:54PM", --before "08/06/25", --before "03:54PM")'
+    ),
+)
 # pylint: disable=redefined-builtin
-def status(
+def status(
+    all_users: bool, limit: Optional[int], after: Optional[str], before: Optional[str]
+):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Shows list of all the jobs
+    """Shows list of all the jobs with optional filtering and pagination
 
     Args:
-        all_users (bool): whether to show all jobs
-
+        all_users (bool): whether to show all jobs for all users
+        limit (Optional[int]): maximum number of jobs to display
+        after (Optional[str]): show jobs created after this timestamp
+        before (Optional[str]): show jobs created before this timestamp
+
+    Examples:
+        konduktor status --limit 10
+        konduktor status --before "08/06/25 03:53PM"
+        konduktor status --all-users --limit 10 --after "08/06/25 03:53PM"
+
+    Note:
+        When using --before or --after timestamps, passing in "08/06/25" is
+        equivalent to passing in "08/06/25 00:00".
+        When using --before or --after timestamps, passing in "03:53PM" is
+        equivalent to passing in "03:53:00PM".
+        Timestamps shown in "konduktor startus" are truncated and are in the
+        local timezone. ex. "03:53:55PM" --> "03:53PM"
+        and would show up in --after "03:53PM" but not in --before "03:53PM"
+        despite status showing as "03:53PM".
     """
     context = kubernetes_utils.get_current_kube_config_context_name()
     namespace = kubernetes_utils.get_kube_config_context_namespace(context)
     user = common_utils.user_and_hostname_hash() if not all_users else 'All'
     click.secho(f'User: {user}', fg='green', bold=True)
     click.secho('Jobs', fg='cyan', bold=True)
-    jobset_utils.show_status_table(
+    jobset_utils.show_status_table(
+        namespace, all_users=all_users, limit=limit, after=after, before=before
+    )
 
 
 @cli.command()
@@ -959,7 +1006,294 @@ def down(
 
 
 @cli.command(cls=_DocumentedCodeCommand)
-@click.argument(
+@click.argument(
+    'jobs',
+    nargs=-1,
+    required=False,
+)
+@click.option('--all', '-a', default=None, is_flag=True, help='Suspend all jobs.')
+@click.option(
+    '--all-users',
+    '--all_users',
+    default=False,
+    is_flag=True,
+    help='Include other users for suspension',
+)
+@click.option(
+    '--yes',
+    '-y',
+    is_flag=True,
+    default=False,
+    required=False,
+    help='Skip confirmation prompt.',
+)
+def stop(
+    jobs: List[str],
+    all: Optional[bool],
+    all_users: Optional[bool],
+    yes: bool,
+):
+    """Suspend job(s) (manual/user-initiated).
+
+    JOB is the name of the job to suspend. If both
+    JOB and ``--all`` are supplied, the latter takes precedence.
+
+    Suspending a job will pause execution and mark the job as SUSPENDED (by user).
+    The job can be resumed later with `konduktor start`.
+
+    If a job is suspended by the system (e.g., due to queueing),
+    it will show as SUSPENDED (by system).
+
+    Wildcard patterns are supported using * characters.
+    Examples: "my_job-*" matches all jobs starting with "my_job-",
+    "*-gpu" matches all jobs ending with "-gpu".
+
+    Examples:
+
+    .. code-block:: bash
+
+        # Suspend a specific job.
+        konduktor stop my_job
+        \b
+        # Suspend multiple jobs.
+        konduktor stop my_job1 my_job2
+        \b
+        # Suspend all jobs matching a pattern.
+        konduktor stop "my_job-*"
+        \b
+        # Suspend all of this users jobs.
+        konduktor stop -a
+        konduktor stop --all
+
+        # Suspend all jobs across all users
+        konduktor stop --all --all-users
+
+    """
+
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+    jobs_response = jobset_utils.list_jobset(namespace)
+    assert jobs_response
+    jobs_specs = [
+        job
+        for job in jobs_response['items']
+        if (
+            job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
+            == common_utils.user_and_hostname_hash()
+            and not all_users
+        )
+    ]
+
+    if all:
+        assert jobs_specs is not None, f'No jobs found in namespace {namespace}'
+        assert len(jobs_specs) > 0, f'No jobs found in namespace {namespace}'
+        jobs = [job['metadata']['name'] for job in jobs_specs]
+    elif jobs:
+        # Get all available jobs to match against patterns
+        if len(jobs_specs) == 0:
+            raise click.ClickException(f'No jobs found in namespace {namespace}')
+
+        all_job_names = [job['metadata']['name'] for job in jobs_specs]
+        matched_jobs = []
+
+        for job_pattern in jobs:
+            # Use fnmatch for both wildcard and exact pattern matching
+            pattern_matches = fnmatch.filter(all_job_names, job_pattern)
+            if not pattern_matches:
+                click.secho(
+                    f'Warning: No jobs found matching pattern "{job_pattern}"',
+                    fg='yellow',
+                    err=True,
+                )
+            matched_jobs.extend(pattern_matches)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        jobs = []
+        for job in matched_jobs:
+            if job not in seen:
+                seen.add(job)
+                jobs.append(job)
+
+        if not jobs:
+            raise click.ClickException(
+                f'No matching jobs found check status with '
+                f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
+            )
+    else:
+        raise click.ClickException(
+            'No jobs specified. Use --all to suspend '
+            'all jobs or specify job names/patterns.'
+        )
+
+    if not yes:
+        # Prompt for confirmation
+        prompt = (
+            f'Suspending job(s) {colorama.Style.BRIGHT} '
+            f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
+            'Proceed?'
+        )
+        if prompt is not None:
+            click.confirm(prompt, default=True, abort=True, show_default=True)
+
+    for job in track(jobs, description='Suspending job(s)...'):
+        jobset_utils.stop_jobset(namespace, job)
+
+
+@cli.command(cls=_DocumentedCodeCommand)
+@click.argument(
+    'jobs',
+    nargs=-1,
+    required=False,
+)
+@click.option(
+    '--all', '-a', default=None, is_flag=True, help='Resume all suspended jobs.'
+)
+@click.option(
+    '--all-users',
+    '--all_users',
+    default=False,
+    is_flag=True,
+    help='Include other users for resumption',
+)
+@click.option(
+    '--yes',
+    '-y',
+    is_flag=True,
+    default=False,
+    required=False,
+    help='Skip confirmation prompt.',
+)
+def start(
+    jobs: List[str],
+    all: Optional[bool],
+    all_users: Optional[bool],
+    yes: bool,
+):
+    """Resume suspended job(s) (manual/user-initiated).
+
+    JOB is the name of the job to resume. If both
+    JOB and ``--all`` are supplied, the latter takes precedence.
+
+    Resuming a job will restart execution from where it was suspended.
+    Only suspended jobs can be resumed.
+
+    This command works for both manually suspended jobs (SUSPENDED by user)
+    and system-suspended jobs (SUSPENDED by system).
+
+    Wildcard patterns are supported using * characters.
+    Examples: "my_job-*" matches all jobs starting with "my_job-",
+    "*-gpu" matches all jobs ending with "-gpu".
+
+    Examples:
+
+    .. code-block:: bash
+
+        # Resume a specific job.
+        konduktor start my_job
+        \b
+        # Resume multiple jobs.
+        konduktor start my_job1 my_job2
+        \b
+        # Resume all jobs matching a pattern.
+        konduktor start "my_job-*"
+        \b
+        # Resume all of this users suspended jobs.
+        konduktor start -a
+        konduktor start --all
+
+        # Resume all suspended jobs across all users
+        konduktor start --all --all-users
+
+    """
+
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+    jobs_response = jobset_utils.list_jobset(namespace)
+    assert jobs_response
+    jobs_specs = [
+        job
+        for job in jobs_response['items']
+        if (
+            job['metadata']['labels'][jobset_utils.JOBSET_USERID_LABEL]
+            == common_utils.user_and_hostname_hash()
+            and not all_users
+        )
+    ]
+
+    if all:
+        # Only get suspended jobs when using --all
+        suspended_jobs = [
+            job['metadata']['name']
+            for job in jobs_specs
+            if job.get('status', {})
+            .get('replicatedJobsStatus', [{}])[0]
+            .get('suspended', False)
+        ]
+        if not suspended_jobs:
+            raise click.ClickException(
+                f'No suspended jobs found in namespace {namespace}'
+            )
+        jobs = suspended_jobs
+    elif jobs:
+        # Get all available jobs to match against patterns
+        if len(jobs_specs) == 0:
+            raise click.ClickException(f'No jobs found in namespace {namespace}')
+
+        all_job_names = [job['metadata']['name'] for job in jobs_specs]
+        matched_jobs = []
+
+        for job_pattern in jobs:
+            # Use fnmatch for both wildcard and exact pattern matching
+            pattern_matches = fnmatch.filter(all_job_names, job_pattern)
+            if not pattern_matches:
+                click.secho(
+                    f'Warning: No jobs found matching pattern "{job_pattern}"',
+                    fg='yellow',
+                    err=True,
+                )
+            matched_jobs.extend(pattern_matches)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        jobs = []
+        for job in matched_jobs:
+            if job not in seen:
+                seen.add(job)
+                jobs.append(job)
+
+        if not jobs:
+            raise click.ClickException(
+                f'No matching jobs found check status with '
+                f'{colorama.Style.BRIGHT}konduktor status{colorama.Style.RESET_ALL}'
+            )
+    else:
+        raise click.ClickException(
+            'No jobs specified. Use --all to resume '
+            'all suspended jobs or specify job names/patterns.'
+        )
+
+    if not yes:
+        # Prompt for confirmation
+        prompt = (
+            f'Resuming job(s) {colorama.Style.BRIGHT} '
+            f'{colorama.Fore.GREEN}{jobs}{colorama.Style.RESET_ALL}. '
+            'Proceed?'
+        )
+        if prompt is not None:
+            click.confirm(prompt, default=True, abort=True, show_default=True)
+
+    for job in track(jobs, description='Resuming job(s)...'):
+        jobset_utils.start_jobset(namespace, job)
+
+
+@cli.command(cls=_DocumentedCodeCommand)
+@click.argument(
+    'clouds',
+    required=True,
+    type=str,
+    nargs=-1,
+)
 def check(clouds: Tuple[str]):
     """Check which clouds are available to use for storage
 
@@ -1143,9 +1477,9 @@ def create(kind, from_file, from_directory, inline, name):
             'name': secret_name,
             'labels': {
                 'parent': 'konduktor',
-
-
-
+                backend_constants.SECRET_OWNER_LABEL: common_utils.get_user_hash(),
+                backend_constants.SECRET_BASENAME_LABEL: basename,
+                backend_constants.SECRET_KIND_LABEL: kind or None,
             },
         }
 
@@ -1153,13 +1487,13 @@ def create(kind, from_file, from_directory, inline, name):
     # Overwrites if user trying to create more than 1
     if kind == 'git-ssh':
         user_hash = common_utils.get_user_hash()
-        label_selector = f'
+        label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
         existing = kubernetes_utils.list_secrets(
             namespace, context, label_filter=label_selector
         )
         for s in existing:
             labels = s.metadata.labels or {}
-            if labels.get(
+            if labels.get(backend_constants.SECRET_KIND_LABEL) == 'git-ssh':
                 old_name = s.metadata.name
                 click.echo(f'Found existing git-ssh secret: {old_name}, deleting it.')
                 kubernetes_utils.delete_secret(
@@ -1188,7 +1522,7 @@ def delete(name):
     namespace = kubernetes_utils.get_kube_config_context_namespace(context)
     user_hash = common_utils.get_user_hash()
 
-    label_selector = f'
+    label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
     secrets = kubernetes_utils.list_secrets(
         namespace, context, label_filter=label_selector
     )
@@ -1196,7 +1530,8 @@ def delete(name):
     matches = [
         s
         for s in secrets
-        if s.metadata.labels
+        if s.metadata.labels
+        and s.metadata.labels.get(backend_constants.SECRET_BASENAME_LABEL) == name
     ]
 
     if not matches:
@@ -1233,7 +1568,7 @@ def list_secrets(all_users: bool):
     if not all_users:
         user_hash = common_utils.get_user_hash()
         username = common_utils.get_cleaned_username()
-        label_selector = f'
+        label_selector = f'{backend_constants.SECRET_OWNER_LABEL}={user_hash}'
         secrets = kubernetes_utils.list_secrets(
             namespace, context, label_filter=label_selector
         )
@@ -1254,9 +1589,9 @@ def list_secrets(all_users: bool):
 
     for s in secrets:
         labels = s.metadata.labels or {}
-        basename = labels.get(
-        kind = labels.get(
-        owner = labels.get(
+        basename = labels.get(backend_constants.SECRET_BASENAME_LABEL, s.metadata.name)
+        kind = labels.get(backend_constants.SECRET_KIND_LABEL, '(none)')
+        owner = labels.get(backend_constants.SECRET_OWNER_LABEL, '(none)')
 
         if all_users:
             click.echo(f'{basename:30} kind={kind:10} owner={owner}')
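The new stop/start commands above expand job-name patterns with the standard-library fnmatch module before acting on them. A self-contained illustration with made-up job names (stdlib only, not konduktor code):

# Wildcard matching as used for `konduktor stop "my_job-*"` style arguments.
import fnmatch

all_job_names = ['my_job-1', 'my_job-2', 'train-gpu', 'eval-gpu']   # hypothetical
patterns = ['my_job-*', '*-gpu', 'missing-*']

matched, seen = [], set()
for pattern in patterns:
    hits = fnmatch.filter(all_job_names, pattern)
    if not hits:
        print(f'Warning: nothing matches {pattern!r}')
    for name in hits:
        if name not in seen:          # dedupe while preserving order
            seen.add(name)
            matched.append(name)

print(matched)   # ['my_job-1', 'my_job-2', 'train-gpu', 'eval-gpu']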
konduktor/config.py
CHANGED
konduktor/data/aws/s3.py
CHANGED
@@ -29,6 +29,7 @@ import colorama
 from konduktor import config, logging
 from konduktor.adaptors import aws
 from konduktor.adaptors.aws import boto3
+from konduktor.backends import constants as backend_constants
 from konduktor.data import constants, data_utils, storage_utils
 from konduktor.utils import (
     annotations,
@@ -1036,7 +1037,7 @@ class S3Store(storage_utils.AbstractStore):
 
         secret_metadata = {
             'labels': {
-
+                backend_constants.SECRET_KIND_LABEL: 'S3',
             },
         }
 
konduktor/data/gcp/gcs.py
CHANGED
@@ -28,6 +28,7 @@ if typing.TYPE_CHECKING:
 
 from konduktor import logging
 from konduktor.adaptors import gcp
+from konduktor.backends import constants as backend_constants
 from konduktor.data import constants, data_utils, storage_utils
 from konduktor.data.gcp import utils
 from konduktor.utils import (
@@ -886,7 +887,7 @@ class GcsStore(storage_utils.AbstractStore):
 
         secret_metadata = {
             'labels': {
-
+                backend_constants.SECRET_KIND_LABEL: 'GCS',
            },
         }
 
konduktor/task.py
CHANGED
konduktor/utils/common_utils.py
CHANGED
@@ -21,9 +21,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import filelock
 import kubernetes
-import yaml
+import yaml  # type: ignore
 
 from konduktor import config, kube_client, logging
+from konduktor.backends import constants as backend_constants
 from konduktor.utils import common_utils, kubernetes_enums
 
 if typing.TYPE_CHECKING:
@@ -604,8 +605,8 @@ def set_secret(
             'name': full_name,
             'labels': {
                 'parent': 'konduktor',
-
-
+                backend_constants.SECRET_OWNER_LABEL: user_hash,
+                backend_constants.SECRET_BASENAME_LABEL: secret_name,
             },
         }
 
@@ -680,7 +681,7 @@ def delete_secret(
 def get_secret_kind(secret: kubernetes.client.V1Secret) -> Optional[str]:
     """Get the konduktor-specific kind of a secret, if labeled."""
     if secret.metadata.labels:
-        return secret.metadata.labels.get(
+        return secret.metadata.labels.get(backend_constants.SECRET_KIND_LABEL)
     return None
 
 
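The set_secret change above attaches owner and basename labels when a secret is written, which is what lets the CLI find it later with a label selector. A hedged, standalone sketch with the raw Kubernetes client (names, values, and the namespace are placeholders, not konduktor's actual helpers):

# Illustrative only: create a labeled secret that a selector like
# 'trainy.ai/secret-owner=<hash>' can later retrieve.
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

user_hash = 'abc123'                      # placeholder for the per-user hash
secret = client.V1Secret(
    metadata=client.V1ObjectMeta(
        name=f'my-token-{user_hash}',
        labels={
            'parent': 'konduktor',
            'trainy.ai/secret-owner': user_hash,
            'trainy.ai/secret-basename': 'my-token',
        },
    ),
    string_data={'token': 'not-a-real-token'},
    type='Opaque',
)
v1.create_namespaced_secret(namespace='default', body=secret)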
{konduktor_nightly-0.1.0.dev20250808105243.dist-info → konduktor_nightly-0.1.0.dev20250810104857.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-konduktor/__init__.py,sha256=
+konduktor/__init__.py,sha256=720Cfjqpm9rheGo2kD7zlSuu5vkqCs9jlmvXtt7kNLg,1574
 konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
 konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -6,15 +6,15 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
 konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
 konduktor/backends/__init__.py,sha256=usWJ8HdZJEyg7MIsN8Zcz9rk9e2Lq5dWJ8dv6hCN3ys,199
 konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
-konduktor/backends/constants.py,sha256=
+konduktor/backends/constants.py,sha256=NfdhY1PQnewvDCjgRKXj6EZDcVH8k_0GGxnMo7w6HDU,666
 konduktor/backends/deployment.py,sha256=EHfB2uLeKFQ3maek9tx6XL4_sjQ-ax59DZA79Q3EkVs,5519
 konduktor/backends/deployment_utils.py,sha256=VGuL01rKe7p7PoVRI_cP4tiZRxHZ13nnTMG-bmDf7P0,28975
 konduktor/backends/jobset.py,sha256=OwgDog9nH-FoUmNU_H--C3U5jx70reTKL1l849M1k5A,8430
-konduktor/backends/jobset_utils.py,sha256=
-konduktor/backends/pod_utils.py,sha256=
+konduktor/backends/jobset_utils.py,sha256=YPbGxbM9FIPNLlvu3_189iGDopWrGLqL_kJO17McRUU,24567
+konduktor/backends/pod_utils.py,sha256=Jfv_CY8suF0e7QEaeQiNRRxRnOueLgPR8SfLEO7lnwc,15260
 konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
-konduktor/cli.py,sha256=
-konduktor/config.py,sha256=
+konduktor/cli.py,sha256=lEZmfrswuxMAyU5hmndMHqk4GkJyohk_TOHBx-0h90M,56316
+konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
 konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
 konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/controller/constants.py,sha256=SGAgu9yTDWYXyVwxlaw1vfRJFOflPR549mKwgdzbI9w,1124
@@ -55,12 +55,12 @@ konduktor/dashboard/frontend/server.js,sha256=jcp6_Ww9YJD3uKY07jR3KMlAM6n1QZdxZn
 konduktor/dashboard/frontend/tailwind.config.js,sha256=fCnc48wvioIDOe5ldQ_6RE7F76cP7aU7pDrxBPJx-Fk,366
 konduktor/data/__init__.py,sha256=KMR2i3E9YcIpiIuCxtRdS7BQ1w2vUAbbve7agziJrLo,213
 konduktor/data/aws/__init__.py,sha256=_6zWfNNAK1QGgyKqg_yPYWcXlnffchyvIMErYa6tw_U,331
-konduktor/data/aws/s3.py,sha256=
+konduktor/data/aws/s3.py,sha256=vW79oNoCwKm97iyUQvDScf2i-bXZ6he55UU-kViFa7I,48580
 konduktor/data/constants.py,sha256=yXVEoTI2we1xOjVSU-bjRCQCLpVvpEvJ0GedXvSwEfw,127
 konduktor/data/data_utils.py,sha256=IG1jgb_La997wi90xCvxYYsHQRlmm8Aooq04ZSf8EDI,9670
 konduktor/data/gcp/__init__.py,sha256=rlQxACBC_Vu36mdgPyJgUy4mGc_6Nt_a96JAuaPz2pQ,489
 konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
-konduktor/data/gcp/gcs.py,sha256=
+konduktor/data/gcp/gcs.py,sha256=ZYHkupCewphSlVwQ5HDAvHG0scwYri9JkklvK9AwcPc,41962
 konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
 konduktor/data/registry.py,sha256=CUbMsN_Q17Pf4wRHkqZrycErEjTP7cLEdgcfwVGcEpc,696
 konduktor/data/storage.py,sha256=o2So-bY9glvgbGdoN7AQNYmNnvGf1AUDPpImtadRL90,35213
@@ -74,9 +74,9 @@ konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-
 konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
 konduktor/resource.py,sha256=qQhMlI6gvTaoGfYb9NNgSrUavgNqfcYVfb9V_oC5pLE,20411
 konduktor/serving.py,sha256=sh8TPAUXg23Bkt0ByatIMdxFFqzRm18HJTEkt3wHzdo,5147
-konduktor/task.py,sha256=
+konduktor/task.py,sha256=97iLCo62qpN9wLGNPeFw64E8k1nch7AyySY3BUXHPWY,37496
 konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
-konduktor/templates/jobset.yaml.j2,sha256=
+konduktor/templates/jobset.yaml.j2,sha256=67yGuY4XdE4KBWN3DKvMJjlypQ0VpdiioRUAhpa3zA4,1072
 konduktor/templates/pod.yaml.j2,sha256=3uXx0ls2v8x-NL_Ypze5u9RoJS8F5bzoyOJcYwzf8Z0,18240
 konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
@@ -84,12 +84,12 @@ konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/utils/accelerator_registry.py,sha256=ythz3ynulP1DSSU7Jj5VUsQeBzSYRkxCVDZ5oOg0xtc,560
 konduktor/utils/annotations.py,sha256=oy2-BLydkFt3KWkXDuaGY84d6b7iISuy4eAT9uXk0Fc,2225
 konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMugyI,3130
-konduktor/utils/common_utils.py,sha256=
+konduktor/utils/common_utils.py,sha256=8gBpzYiC1bQ8sbgHIFLkKCGT5nLs1afpejod60kVSos,15076
 konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
 konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
 konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
 konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
-konduktor/utils/kubernetes_utils.py,sha256=
+konduktor/utils/kubernetes_utils.py,sha256=7RThCOiyaALRqbwHZ40qMnBsbAgt669k0NHkxtfx7Bs,26205
 konduktor/utils/log_utils.py,sha256=k4Qo0OlUZYQmLcbSD9tDWe6_Q5XcsLO_K8uVWjlTEU0,16938
 konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
 konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
@@ -97,8 +97,8 @@ konduktor/utils/schemas.py,sha256=tBrKhnkfn9uKDYdlb4L2KgooW-muuhww7U8fu9zX-ms,18
 konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
 konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
 konduktor/utils/validator.py,sha256=5C1kE57Eyj1OPnAbvojqMNHHtf5fnl47FK_vEttd8aw,4331
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
+konduktor_nightly-0.1.0.dev20250810104857.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
+konduktor_nightly-0.1.0.dev20250810104857.dist-info/METADATA,sha256=0ywusaz5sGzpRJDF0Y_Si5RxQ-R0L49GygxgmhHjRLU,4247
+konduktor_nightly-0.1.0.dev20250810104857.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+konduktor_nightly-0.1.0.dev20250810104857.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
+konduktor_nightly-0.1.0.dev20250810104857.dist-info/RECORD,,
File without changes
File without changes