konduktor-nightly 0.1.0.dev20250811105223__py3-none-any.whl → 0.1.0.dev20250813105033__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

konduktor/__init__.py CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '92fe69bd3f29e7b191de663c598dfcf10738f87a'
14
+ _KONDUKTOR_COMMIT_SHA = 'f4ba2084fac1c1030245b475323f4f3a57fd3fa3'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250811105223'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250813105033'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -8,6 +8,7 @@ USERID_LABEL = 'trainy.ai/user-id'
8
8
  USER_LABEL = 'trainy.ai/username'
9
9
  ACCELERATOR_LABEL = 'trainy.ai/accelerator'
10
10
  NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
11
+ MAX_EXECUTION_TIME_LABEL = 'kueue.x-k8s.io/max-exec-time-seconds'
11
12
 
12
13
  # Start/stop/status related labels
13
14
  STOP_USERID_LABEL = 'trainy.ai/stop-userid'
@@ -176,7 +176,7 @@ class JobsetBackend(backend.Backend):
176
176
  context = kubernetes_utils.get_current_kube_config_context_name()
177
177
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
178
178
  # TODO(asaiacai): need to set env variables in pod
179
- jobset_utils.create_jobset(
179
+ jobset_response = jobset_utils.create_jobset(
180
180
  namespace,
181
181
  task,
182
182
  pod_spec['kubernetes']['pod_config'],
@@ -192,9 +192,10 @@ class JobsetBackend(backend.Backend):
192
192
  ):
193
193
  _wait_for_jobset_start(namespace, task.name)
194
194
  try:
195
+ assert jobset_response is not None
195
196
  log_thread = threading.Thread(
196
197
  target=log_utils.tail_logs,
197
- args=(task.name,),
198
+ args=(jobset_response,),
198
199
  daemon=True,
199
200
  )
200
201
  logger.info('streaming logs...')
@@ -39,6 +39,7 @@ JOBSET_USERID_LABEL = backend_constants.USERID_LABEL
39
39
  JOBSET_USER_LABEL = backend_constants.USER_LABEL
40
40
  JOBSET_ACCELERATOR_LABEL = backend_constants.ACCELERATOR_LABEL
41
41
  JOBSET_NUM_ACCELERATORS_LABEL = backend_constants.NUM_ACCELERATORS_LABEL
42
+ JOBSET_MAX_EXECUTION_TIME_LABEL = backend_constants.MAX_EXECUTION_TIME_LABEL
42
43
 
43
44
  SECRET_BASENAME_LABEL = backend_constants.SECRET_BASENAME_LABEL
44
45
 
@@ -48,6 +49,7 @@ _JOBSET_METADATA_LABELS = {
48
49
  'jobset_user_label': JOBSET_USER_LABEL,
49
50
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
50
51
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
52
+ 'jobset_max_execution_time_label': JOBSET_MAX_EXECUTION_TIME_LABEL,
51
53
  }
52
54
 
53
55
 
@@ -79,6 +81,7 @@ def create_jobset(
79
81
  assert task.resources is not None, 'Task resources are undefined'
80
82
  accelerator_type = task.resources.get_accelerator_type() or 'None'
81
83
  num_accelerators = task.resources.get_accelerator_count() or 0
84
+ labels = task.resources.labels if task.resources.labels else {}
82
85
  with tempfile.NamedTemporaryFile() as temp:
83
86
  common_utils.fill_template(
84
87
  'jobset.yaml.j2',
@@ -91,6 +94,7 @@ def create_jobset(
91
94
  'num_accelerators': num_accelerators,
92
95
  'completions': task.resources.get_completions(),
93
96
  'max_restarts': task.resources.get_max_restarts(),
97
+ 'max_execution_time': labels.get('maxRunDurationSeconds', None),
94
98
  **_JOBSET_METADATA_LABELS,
95
99
  },
96
100
  temp.name,
@@ -430,6 +434,36 @@ def _parse_timestamp_filter(timestamp_str: str) -> datetime:
430
434
  )
431
435
 
432
436
 
437
+ def _format_timestamp(timestamp: str) -> str:
438
+ """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
439
+ # Parse UTC timestamp and convert to local time
440
+ dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
441
+ tzinfo=timezone.utc
442
+ )
443
+ dt_local = dt_utc.astimezone() # Convert to local timezone
444
+ return dt_local.strftime('%m/%d/%y %I:%M%p')
445
+
446
+
447
+ def _get_job_start_time(job: Dict[str, Any]) -> str:
448
+ for condition in job['status']['conditions']:
449
+ if condition['reason'] == 'ResumeJobs':
450
+ return condition.get('lastTransitionTime', '')
451
+ return '-'
452
+
453
+
454
+ def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
455
+ """Extract end time from JobSet conditions (Completed or Failed)"""
456
+ conditions = job.get('status', {}).get('conditions', [])
457
+ for condition in conditions:
458
+ # Look for terminal conditions with status=True
459
+ if (
460
+ condition.get('type') in ['Completed', 'Failed']
461
+ and condition.get('status') == 'True'
462
+ ):
463
+ return condition.get('lastTransitionTime', '')
464
+ return '-'
465
+
466
+
433
467
  def show_status_table(
434
468
  namespace: str,
435
469
  all_users: bool,
@@ -523,15 +557,6 @@ def show_status_table(
523
557
  result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
524
558
  return result if result else '<1 minute', delta
525
559
 
526
- def _format_timestamp(timestamp: str) -> str:
527
- """Format timestamp as MM/DD/YY HH:MMAM/PM in local timezone"""
528
- # Parse UTC timestamp and convert to local time
529
- dt_utc = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ').replace(
530
- tzinfo=timezone.utc
531
- )
532
- dt_local = dt_utc.astimezone() # Convert to local timezone
533
- return dt_local.strftime('%m/%d/%y %I:%M%p')
534
-
535
560
  def _get_resources(job: Dict[str, Any]) -> str:
536
561
  num_pods = int(
537
562
  job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
@@ -591,25 +616,17 @@ def show_status_table(
591
616
  if before_dt and job_creation_time >= before_dt:
592
617
  continue
593
618
  # Get start time
594
- start_time = _format_timestamp(job['metadata']['creationTimestamp'])
619
+ start_time = _get_job_start_time(job)
620
+ if start_time != '-':
621
+ start_time = _format_timestamp(start_time)
595
622
 
596
623
  # Get submitted time (how long ago)
597
624
  submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
598
625
 
599
626
  # Get end time (from JobSet conditions)
600
- def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
601
- """Extract end time from JobSet conditions (Completed or Failed)"""
602
- conditions = job.get('status', {}).get('conditions', [])
603
- for condition in conditions:
604
- # Look for terminal conditions with status=True
605
- if (
606
- condition.get('type') in ['Completed', 'Failed']
607
- and condition.get('status') == 'True'
608
- ):
609
- return _format_timestamp(condition.get('lastTransitionTime', ''))
610
- return '-'
611
-
612
627
  end_time = _get_end_time_from_conditions(job)
628
+ if end_time != '-':
629
+ end_time = _format_timestamp(end_time)
613
630
 
614
631
  if all_users:
615
632
  rows.append(
konduktor/cli.py CHANGED
@@ -732,7 +732,7 @@ def logs(
732
732
  # Verify the job exists before attempting to tail logs
733
733
  # TODO(asaiacai): unify the 404 logic under jobset_utils
734
734
  try:
735
- jobset_utils.get_jobset(namespace, job_id)
735
+ jobset_response = jobset_utils.get_jobset(namespace, job_id)
736
736
  except jobset_utils.JobNotFoundError:
737
737
  raise click.UsageError(
738
738
  f"Job '{job_id}' not found in namespace "
@@ -741,12 +741,9 @@ def logs(
741
741
  f'{colorama.Style.RESET_ALL}.'
742
742
  )
743
743
 
744
- click.secho(
745
- 'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
746
- fg='yellow',
747
- )
744
+ assert isinstance(jobset_response, dict), f'jobset_response: {jobset_response}'
748
745
  log_utils.tail_logs(
749
- job_id,
746
+ jobset_response,
750
747
  worker_id=node_rank,
751
748
  follow=follow,
752
749
  num_logs=num_lines,
@@ -11,6 +11,9 @@ jobset:
11
11
  {{ jobset_accelerator_label }}: "{{ accelerator_type }}"
12
12
  {{ jobset_num_accelerators_label }}: "{{ num_accelerators }}"
13
13
  {% endif %}
14
+ {% if max_execution_time %}
15
+ {{ jobset_max_execution_time_label }}: "{{ max_execution_time }}"
16
+ {% endif %}
14
17
  trainy.ai/konduktor-managed: "true"
15
18
  parent: "trainy"
16
19
  annotations: {}
@@ -337,45 +337,51 @@ def tail_loki_logs_ws(
337
337
 
338
338
 
339
339
  def tail_vicky_logs(
340
- job_name: str,
340
+ jobset_response: Dict[str, Any],
341
341
  worker_id: int = 0,
342
- num_logs: int = 1000,
342
+ num_logs: int = -1,
343
343
  follow: bool = True,
344
344
  ):
345
+ job_name = jobset_response['metadata']['name']
345
346
  context = kubernetes_utils.get_current_kube_config_context_name()
346
347
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
347
348
  query: Dict[str, Any] = {}
348
- if num_logs > 5000:
349
- # TODO(asaiacai): we should not have a limit on the number of logs, but rather
350
- # let the user specify any number of lines, and we can print the last N lines.
351
- # this can be done in chunks. Potentially, we can query range
352
- # until we reach the end of the log and then invoke tail again.
353
- # Also include checks that the job is running/ever ran.
354
- raise ValueError('num_logs must be less than or equal to 5000')
355
- logger.info('ignoring num_logs argument for VictoriaLogs')
356
349
  vicky_svc = kr8s.objects.Service.get(
357
350
  'vls-victoria-logs-single-server', namespace='victoria-logs'
358
351
  )
352
+
353
+ if num_logs == -1:
354
+ query = {}
355
+ else:
356
+ assert num_logs > 0, f'num_logs must be greater than 0, got {num_logs}'
357
+ query = {'limit': num_logs}
358
+ if follow:
359
+ logger.info(
360
+ 'No end time found, tailing logs from 1 hour ago. '
361
+ 'If logs come up empty, there might be logs just earlier '
362
+ 'than the past hour, check Grafana or use:\n'
363
+ f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
364
+ f'`konduktor tail --no-follow {job_name}`'
365
+ f'{colorama.Style.RESET_ALL}'
366
+ )
367
+ query['start_offset'] = '1h'
368
+ query['query'] = (
369
+ f'k8s.namespace.name: "{namespace}" AND '
370
+ f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
371
+ f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
372
+ )
373
+
359
374
  with kr8s.portforward.PortForward(
360
375
  vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
361
376
  ) as port:
362
377
  if follow:
363
378
  timeout = INFINITY
364
379
  vicky_url = f'http://localhost:{port}/select/logsql/tail'
365
- query = {}
366
380
  else:
367
381
  vicky_url = f'http://localhost:{port}/select/logsql/query'
368
- query = {'limit': num_logs}
369
382
  timeout = 1
370
383
  logger.debug(f'Vicky URL: {vicky_url}')
371
384
 
372
- query['query'] = (
373
- f'k8s.namespace.name: "{namespace}" AND '
374
- f'batch.kubernetes.io/job-name: "{job_name}-workers-0" AND '
375
- f'batch.kubernetes.io/job-completion-index: "{worker_id}"'
376
- )
377
- query['start_offset'] = '1h'
378
-
379
385
  try:
380
386
  logger.debug(f'Making request to {vicky_url} with query: {query}')
381
387
  with requests.post(
@@ -412,16 +418,17 @@ def tail_vicky_logs(
412
418
 
413
419
 
414
420
  def tail_logs(
415
- job_name: str,
421
+ jobset_response: Dict[str, Any],
416
422
  worker_id: int = 0,
417
423
  num_logs: int = 1000,
418
424
  follow: bool = True,
419
425
  ):
426
+ job_name = jobset_response['metadata']['name']
420
427
  logs_backend = config.get_nested(('logs', 'backend'), None)
421
428
  if logs_backend == LogBackend.VICTORIA:
422
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
429
+ tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
423
430
  elif logs_backend == LogBackend.LOKI:
424
431
  tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
425
432
  else:
426
433
  logger.info('Defaulting to VictoriaLogs')
427
- tail_vicky_logs(job_name, worker_id, num_logs, follow)
434
+ tail_vicky_logs(jobset_response, worker_id, num_logs, follow)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250811105223
3
+ Version: 0.1.0.dev20250813105033
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=DYiQ-TfOdYUqIzT8psdjnWcjtjMI4sbldAghArAY5e0,1574
1
+ konduktor/__init__.py,sha256=_hY0EbT0p1RyIpToDmvNWpK2QTJNNplI92T0AlW7tck,1574
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -6,14 +6,14 @@ konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,410
6
6
  konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
7
7
  konduktor/backends/__init__.py,sha256=usWJ8HdZJEyg7MIsN8Zcz9rk9e2Lq5dWJ8dv6hCN3ys,199
8
8
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
9
- konduktor/backends/constants.py,sha256=NfdhY1PQnewvDCjgRKXj6EZDcVH8k_0GGxnMo7w6HDU,666
9
+ konduktor/backends/constants.py,sha256=nt9G9AmFCOMwO4GuKgRQSzJJuKapOmaROp4_Y0tMF5A,732
10
10
  konduktor/backends/deployment.py,sha256=EHfB2uLeKFQ3maek9tx6XL4_sjQ-ax59DZA79Q3EkVs,5519
11
11
  konduktor/backends/deployment_utils.py,sha256=VGuL01rKe7p7PoVRI_cP4tiZRxHZ13nnTMG-bmDf7P0,28975
12
- konduktor/backends/jobset.py,sha256=OwgDog9nH-FoUmNU_H--C3U5jx70reTKL1l849M1k5A,8430
13
- konduktor/backends/jobset_utils.py,sha256=O983a78D411go_F0K2mijZAE1dXAFF7i6aQ7rOrfH7A,24663
12
+ konduktor/backends/jobset.py,sha256=E9THHmcpxTohsx6Goi9mKF4dy_mYpR2DHloSwGVr9jA,8509
13
+ konduktor/backends/jobset_utils.py,sha256=7fB8X4b2Q5BKFCIGME72dyeCfi-EemoMeJVnwtzcjq4,25184
14
14
  konduktor/backends/pod_utils.py,sha256=Jfv_CY8suF0e7QEaeQiNRRxRnOueLgPR8SfLEO7lnwc,15260
15
15
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
16
- konduktor/cli.py,sha256=9S6DEsK_qlD34UM6CwFah0FmJgQ4lVaV-LViKp9fJ6o,56687
16
+ konduktor/cli.py,sha256=YD9gMH2ZJykdfrHvzY-DkPQgD-cltahEW141wdI8eiI,56674
17
17
  konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
18
18
  konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
19
19
  konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -76,7 +76,7 @@ konduktor/resource.py,sha256=qQhMlI6gvTaoGfYb9NNgSrUavgNqfcYVfb9V_oC5pLE,20411
76
76
  konduktor/serving.py,sha256=sh8TPAUXg23Bkt0ByatIMdxFFqzRm18HJTEkt3wHzdo,5147
77
77
  konduktor/task.py,sha256=97iLCo62qpN9wLGNPeFw64E8k1nch7AyySY3BUXHPWY,37496
78
78
  konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
79
- konduktor/templates/jobset.yaml.j2,sha256=67yGuY4XdE4KBWN3DKvMJjlypQ0VpdiioRUAhpa3zA4,1072
79
+ konduktor/templates/jobset.yaml.j2,sha256=gURWl6uQv_OLni-LFy2E7ttjGOtuRDt5Vfs4ALH7fpI,1196
80
80
  konduktor/templates/pod.yaml.j2,sha256=3uXx0ls2v8x-NL_Ypze5u9RoJS8F5bzoyOJcYwzf8Z0,18240
81
81
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
82
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
@@ -90,15 +90,15 @@ konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4x
90
90
  konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
91
91
  konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
92
92
  konduktor/utils/kubernetes_utils.py,sha256=7RThCOiyaALRqbwHZ40qMnBsbAgt669k0NHkxtfx7Bs,26205
93
- konduktor/utils/log_utils.py,sha256=k4Qo0OlUZYQmLcbSD9tDWe6_Q5XcsLO_K8uVWjlTEU0,16938
93
+ konduktor/utils/log_utils.py,sha256=xg5-NM1l3oodRTkiKihuzwe82g7XnfTzprFPndSF1A8,17032
94
94
  konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
95
95
  konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
96
96
  konduktor/utils/schemas.py,sha256=tBrKhnkfn9uKDYdlb4L2KgooW-muuhww7U8fu9zX-ms,18336
97
97
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
98
98
  konduktor/utils/ux_utils.py,sha256=7-Lt3QbDVvBQUli5_U9lOdXKeC-ip8rZBpO9gQ6vPJw,7955
99
99
  konduktor/utils/validator.py,sha256=5C1kE57Eyj1OPnAbvojqMNHHtf5fnl47FK_vEttd8aw,4331
100
- konduktor_nightly-0.1.0.dev20250811105223.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
- konduktor_nightly-0.1.0.dev20250811105223.dist-info/METADATA,sha256=Gp9W_UVyGtpg-hU24Hm1DU-RKr-Hkzx10JQmBsrJMdQ,4247
102
- konduktor_nightly-0.1.0.dev20250811105223.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
- konduktor_nightly-0.1.0.dev20250811105223.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
- konduktor_nightly-0.1.0.dev20250811105223.dist-info/RECORD,,
100
+ konduktor_nightly-0.1.0.dev20250813105033.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
+ konduktor_nightly-0.1.0.dev20250813105033.dist-info/METADATA,sha256=SG5D0d2YiNKbLh9yKpMif5NucNJgwSYp-lU55QXL00c,4247
102
+ konduktor_nightly-0.1.0.dev20250813105033.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
+ konduktor_nightly-0.1.0.dev20250813105033.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
+ konduktor_nightly-0.1.0.dev20250813105033.dist-info/RECORD,,