konduktor-nightly 0.1.0.dev20250819104842__py3-none-any.whl → 0.1.0.dev20250821104804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

konduktor/__init__.py CHANGED
@@ -11,7 +11,7 @@ from konduktor.task import Task
11
11
  __all__ = ['launch', 'Resources', 'Task', 'Serving']
12
12
 
13
13
  # Replaced with the current commit when building the wheels.
14
- _KONDUKTOR_COMMIT_SHA = '108d7fe47b1bd5db50d555510714d2e204fb7b6f'
14
+ _KONDUKTOR_COMMIT_SHA = 'eee38d922bf4c7cb8a2e6e730092dde0ae372500'
15
15
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
16
16
 
17
17
 
@@ -45,5 +45,5 @@ def _get_git_commit():
45
45
 
46
46
 
47
47
  __commit__ = _get_git_commit()
48
- __version__ = '1.0.0.dev0.1.0.dev20250819104842'
48
+ __version__ = '1.0.0.dev0.1.0.dev20250821104804'
49
49
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -464,6 +464,41 @@ def _get_end_time_from_conditions(job: Dict[str, Any]) -> str:
464
464
  return '-'
465
465
 
466
466
 
467
+ def _get_time_delta(delta: 'timedelta') -> Tuple[str, 'timedelta']:
468
+ total_seconds = int(delta.total_seconds())
469
+
470
+ days, remainder = divmod(total_seconds, 86400) # 86400 seconds in a day
471
+ hours, remainder = divmod(remainder, 3600) # 3600 seconds in an hour
472
+ minutes, seconds = divmod(remainder, 60) # 60 seconds in a minute
473
+
474
+ days_str = f'{days} day{"s" if days != 1 else ""}, ' if days > 0 else ''
475
+ hours_str = f'{hours} hr{"s" if hours != 1 else ""}, ' if hours > 0 else ''
476
+ minutes_str = (
477
+ f'{minutes} min{"s" if minutes != 1 else ""}'
478
+ if minutes > 0 and days == 0
479
+ else ''
480
+ )
481
+
482
+ seconds_str = (
483
+ f'{seconds} sec{"s" if seconds != 1 else ""}'
484
+ if seconds > 0 and days == 0 and hours == 0 and minutes == 0
485
+ else ''
486
+ )
487
+
488
+ result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
489
+ return result if result else '<1 minute', delta
490
+
491
+
492
+ def _get_job_length(start_time: str, end_time: str) -> str:
493
+ if start_time == '-' or end_time == '-':
494
+ return '-'
495
+ else:
496
+ start = datetime.strptime(start_time, '%m/%d/%y %I:%M%p')
497
+ end = datetime.strptime(end_time, '%m/%d/%y %I:%M%p')
498
+ delta, _ = _get_time_delta(end - start)
499
+ return delta
500
+
501
+
467
502
  def show_status_table(
468
503
  namespace: str,
469
504
  all_users: bool,
@@ -531,32 +566,6 @@ def show_status_table(
531
566
  f'{JobStatus.PENDING.name}{colorama.Style.RESET_ALL}'
532
567
  )
533
568
 
534
- def _get_time_delta(timestamp: str) -> Tuple[str, 'timedelta']:
535
- delta = datetime.now(timezone.utc) - datetime.strptime(
536
- timestamp, '%Y-%m-%dT%H:%M:%SZ'
537
- ).replace(tzinfo=timezone.utc)
538
- total_seconds = int(delta.total_seconds())
539
-
540
- days, remainder = divmod(total_seconds, 86400) # 86400 seconds in a day
541
- hours, remainder = divmod(remainder, 3600) # 3600 seconds in an hour
542
- minutes, seconds = divmod(remainder, 60) # 60 seconds in a minute
543
-
544
- days_str = f'{days} day{"s" if days != 1 else ""}, ' if days > 0 else ''
545
- hours_str = f'{hours} hr{"s" if hours != 1 else ""}, ' if hours > 0 else ''
546
- minutes_str = (
547
- f'{minutes} min{"s" if minutes != 1 else ""}'
548
- if minutes > 0 and days == 0
549
- else ''
550
- )
551
- seconds_str = (
552
- f'{seconds} sec{"s" if seconds != 1 else ""}'
553
- if seconds > 0 and days == 0 and hours == 0 and minutes == 0
554
- else ''
555
- )
556
-
557
- result = f'{days_str}{hours_str}{minutes_str}{seconds_str}'
558
- return result if result else '<1 minute', delta
559
-
560
569
  def _get_resources(job: Dict[str, Any]) -> str:
561
570
  num_pods = int(
562
571
  job['spec']['replicatedJobs'][0]['template']['spec']['parallelism']
@@ -580,9 +589,18 @@ def show_status_table(
580
589
  'SUBMITTED',
581
590
  'START TIME',
582
591
  'END TIME',
592
+ 'DURATION',
583
593
  ]
584
594
  else:
585
- columns = ['NAME', 'STATUS', 'RESOURCES', 'SUBMITTED', 'START TIME', 'END TIME']
595
+ columns = [
596
+ 'NAME',
597
+ 'STATUS',
598
+ 'RESOURCES',
599
+ 'SUBMITTED',
600
+ 'START TIME',
601
+ 'END TIME',
602
+ 'DURATION',
603
+ ]
586
604
  job_table = log_utils.create_table(columns)
587
605
  job_specs = list_jobset(namespace)
588
606
  assert job_specs is not None, 'Retrieving jobs failed'
@@ -621,13 +639,18 @@ def show_status_table(
621
639
  start_time = _format_timestamp(start_time)
622
640
 
623
641
  # Get submitted time (how long ago)
624
- submitted_time, _ = _get_time_delta(job['metadata']['creationTimestamp'])
642
+ time_delta = datetime.now(timezone.utc) - datetime.strptime(
643
+ job['metadata']['creationTimestamp'], '%Y-%m-%dT%H:%M:%SZ'
644
+ ).replace(tzinfo=timezone.utc)
645
+ submitted_time, _ = _get_time_delta(time_delta)
625
646
 
626
647
  # Get end time (from JobSet conditions)
627
648
  end_time = _get_end_time_from_conditions(job)
628
649
  if end_time != '-':
629
650
  end_time = _format_timestamp(end_time)
630
651
 
652
+ job_length = _get_job_length(start_time, end_time)
653
+
631
654
  if all_users:
632
655
  rows.append(
633
656
  [
@@ -638,6 +661,7 @@ def show_status_table(
638
661
  submitted_time,
639
662
  start_time,
640
663
  end_time,
664
+ job_length,
641
665
  job['metadata']['creationTimestamp'],
642
666
  ]
643
667
  )
@@ -654,6 +678,7 @@ def show_status_table(
654
678
  submitted_time,
655
679
  start_time,
656
680
  end_time,
681
+ job_length,
657
682
  job['metadata']['creationTimestamp'],
658
683
  ]
659
684
  )
@@ -1,6 +1,7 @@
1
1
  """Pod utils: handles pod spec creation and manipulation"""
2
2
 
3
3
  import base64
4
+ import json
4
5
  import os
5
6
  import tempfile
6
7
  import typing
@@ -284,7 +285,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
284
285
  pod_config['kubernetes']['pod_config']['spec']['containers'][0]['env'] = list(
285
286
  env_map.values()
286
287
  )
287
- logger.debug(f'rendered pod spec: \n\t{pod_config}')
288
+ logger.debug(f'rendered pod spec: \n\t{json.dumps(pod_config, indent=2)}')
288
289
 
289
290
  # validate pod spec using json schema
290
291
  try:
@@ -161,6 +161,10 @@ kubernetes:
161
161
  - name: git-ssh-secret
162
162
  mountPath: /run/konduktor/git-ssh-secret
163
163
  {% endif %}
164
+ {% if tailscale_secret %}
165
+ - name: tailscale-state
166
+ mountPath: /var/lib/tailscale
167
+ {% endif %}
164
168
  command: ["bash", "-c"]
165
169
  args:
166
170
  - |
@@ -317,18 +321,19 @@ kubernetes:
317
321
  export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
318
322
  $(prefix_cmd) echo "TS_HOSTNAME=${TS_HOSTNAME}" >> /etc/environment
319
323
  function InstallTailscale {
320
- while ! tailscale status >/dev/null 2>&1; do
324
+ if ! command -v tailscale >/dev/null 2>&1; then
325
+ $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
326
+ fi
327
+ if ! tailscale status >/dev/null 2>&1; then
321
328
  $(prefix_cmd) mkdir -p /var/run/tailscale /var/cache/tailscale /var/lib/tailscale
322
- if ! command -v tailscale >/dev/null 2>&1; then
323
- $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
324
- fi
325
- $(prefix_cmd) tailscaled --tun=userspace-networking --state=mem: >~/.konduktor/tmp/tailscaled.log 2>&1 &
326
- $(prefix_cmd) sleep 2
327
- $(prefix_cmd) timeout 5 tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} || echo "tailscale up failed retrying"
328
- $(prefix_cmd) sleep 2
329
+ $(prefix_cmd) nohup tailscaled --tun=userspace-networking >~/.konduktor/tmp/tailscaled.log 2>&1 &
330
+ fi
331
+ until tailscale status >/dev/null 2>&1; do
332
+ $(prefix_cmd) tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} --accept-dns=false || echo "tailscale up failed retrying"
329
333
  done
330
334
  $(prefix_cmd) echo "Tailscale is up"
331
335
  $(prefix_cmd) tailscale status
336
+ $(prefix_cmd) tailscale netcheck
332
337
  }
333
338
  InstallTailscale | tee ~/.konduktor/tmp/tailscale-out.log
334
339
  {% if konduktor_debug %}
@@ -413,6 +418,10 @@ kubernetes:
413
418
  emptyDir:
414
419
  medium: "Memory"
415
420
  sizeLimit: 4Gi
421
+ {% if tailscale_secret %}
422
+ - name: tailscale-state
423
+ emptyDir: {}
424
+ {% endif %}
416
425
  - name: sync
417
426
  emptyDir: {}
418
427
  {% for secret_type, secret_name in mount_secrets.items() %}
@@ -397,6 +397,8 @@ def tail_vicky_logs(
397
397
  for line in response.iter_lines(decode_unicode=True):
398
398
  if line:
399
399
  payload = json.loads(line)
400
+ if 'missing _msg field' in payload['_msg']:
401
+ payload['_msg'] = ''
400
402
  print(
401
403
  f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
402
404
  f"(job_name={job_name} worker_id={worker_id})"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250819104842
3
+ Version: 0.1.0.dev20250821104804
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,4 +1,4 @@
1
- konduktor/__init__.py,sha256=JjXvsKhn2OfPRW0uQtLdrb4JnzbqDdv7tR9E6WsTax4,1574
1
+ konduktor/__init__.py,sha256=vSTzrg0X_r9sPmZDztRtaQbXDhuOpoRTQWuv4z1wybA,1574
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
4
  konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
@@ -10,8 +10,8 @@ konduktor/backends/constants.py,sha256=nt9G9AmFCOMwO4GuKgRQSzJJuKapOmaROp4_Y0tMF
10
10
  konduktor/backends/deployment.py,sha256=EHfB2uLeKFQ3maek9tx6XL4_sjQ-ax59DZA79Q3EkVs,5519
11
11
  konduktor/backends/deployment_utils.py,sha256=VGuL01rKe7p7PoVRI_cP4tiZRxHZ13nnTMG-bmDf7P0,28975
12
12
  konduktor/backends/jobset.py,sha256=OwgDog9nH-FoUmNU_H--C3U5jx70reTKL1l849M1k5A,8430
13
- konduktor/backends/jobset_utils.py,sha256=7fB8X4b2Q5BKFCIGME72dyeCfi-EemoMeJVnwtzcjq4,25184
14
- konduktor/backends/pod_utils.py,sha256=Jfv_CY8suF0e7QEaeQiNRRxRnOueLgPR8SfLEO7lnwc,15260
13
+ konduktor/backends/jobset_utils.py,sha256=8YUIFg7mi33A5p_K9GhVdjllTdXRMj_GEg-iQ5Nj_iU,25708
14
+ konduktor/backends/pod_utils.py,sha256=EkoyN4mMfnrwEG656J1kxPtNZQClo4X7JEyzV1F8oHg,15294
15
15
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
16
16
  konduktor/cli.py,sha256=YW9vNlbayDIcHk5D8-qsVGwoOdRV1o9kFmF2jtBf3vE,56641
17
17
  konduktor/config.py,sha256=9upqgCCYvcu6fKw7tovEYC1MWTkAAir0_WHPdayylbI,15536
@@ -77,7 +77,7 @@ konduktor/serving.py,sha256=sh8TPAUXg23Bkt0ByatIMdxFFqzRm18HJTEkt3wHzdo,5147
77
77
  konduktor/task.py,sha256=97iLCo62qpN9wLGNPeFw64E8k1nch7AyySY3BUXHPWY,37496
78
78
  konduktor/templates/deployment.yaml.j2,sha256=uXFjDQaimbpFdAn2RJGaIvS_PzDY136cw_L3QMjz3ZA,3452
79
79
  konduktor/templates/jobset.yaml.j2,sha256=gURWl6uQv_OLni-LFy2E7ttjGOtuRDt5Vfs4ALH7fpI,1196
80
- konduktor/templates/pod.yaml.j2,sha256=gxBaHFwDfRE71nh3glYPsmirOr6Qn__-f6oHmQRP4QU,18809
80
+ konduktor/templates/pod.yaml.j2,sha256=owid7Wpo-kmQ-Gvx1zVoyG_feOiKviGDsnWLTlReXPo,19086
81
81
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
82
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
83
83
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -90,15 +90,15 @@ konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4x
90
90
  konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
91
91
  konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
92
92
  konduktor/utils/kubernetes_utils.py,sha256=7RThCOiyaALRqbwHZ40qMnBsbAgt669k0NHkxtfx7Bs,26205
93
- konduktor/utils/log_utils.py,sha256=jxm9ovPcJPOGfd2wOwPDEThRO25ETIV5a1DmAfLhqJc,16861
93
+ konduktor/utils/log_utils.py,sha256=k43eGpSwIdGhNotC8w7_Hq-bbyQQTrHwMOAMct_gr9M,16978
94
94
  konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
95
95
  konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
96
96
  konduktor/utils/schemas.py,sha256=tBrKhnkfn9uKDYdlb4L2KgooW-muuhww7U8fu9zX-ms,18336
97
97
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
98
98
  konduktor/utils/ux_utils.py,sha256=7-Lt3QbDVvBQUli5_U9lOdXKeC-ip8rZBpO9gQ6vPJw,7955
99
99
  konduktor/utils/validator.py,sha256=5C1kE57Eyj1OPnAbvojqMNHHtf5fnl47FK_vEttd8aw,4331
100
- konduktor_nightly-0.1.0.dev20250819104842.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
- konduktor_nightly-0.1.0.dev20250819104842.dist-info/METADATA,sha256=YevrR3deaLynmnMl_mbF4nNSpMXXgL0PhHlnRS3Xlu8,4247
102
- konduktor_nightly-0.1.0.dev20250819104842.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
- konduktor_nightly-0.1.0.dev20250819104842.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
- konduktor_nightly-0.1.0.dev20250819104842.dist-info/RECORD,,
100
+ konduktor_nightly-0.1.0.dev20250821104804.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
101
+ konduktor_nightly-0.1.0.dev20250821104804.dist-info/METADATA,sha256=qQPp3F_j37Mx4Ios1of0b_961SL9jkOJd-5sZ-_mFwE,4247
102
+ konduktor_nightly-0.1.0.dev20250821104804.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
103
+ konduktor_nightly-0.1.0.dev20250821104804.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
104
+ konduktor_nightly-0.1.0.dev20250821104804.dist-info/RECORD,,