konduktor-nightly 0.1.0.dev20250324104653__py3-none-any.whl → 0.1.0.dev20250326104701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '1099bba7fab85c3954acf0b9de385239105f2047'
17
+ _KONDUKTOR_COMMIT_SHA = 'c26d3750b0c2f6f2604b4747a767367294157533'
18
18
 
19
19
 
20
20
  def _get_git_commit():
@@ -47,5 +47,5 @@ def _get_git_commit():
47
47
 
48
48
 
49
49
  __commit__ = _get_git_commit()
50
- __version__ = '1.0.0.dev0.1.0.dev20250324104653'
50
+ __version__ = '1.0.0.dev0.1.0.dev20250326104701'
51
51
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -58,6 +58,7 @@ def _raise_job_error(job):
58
58
 
59
59
 
60
60
  def _wait_for_jobset_start(namespace: str, job_name: str):
61
+ time.sleep(2)
61
62
  start = time.time()
62
63
  timeout = config.get_nested(
63
64
  ('kubernetes', 'provision_timeout'),
@@ -14,7 +14,7 @@ from konduktor import logging
14
14
 
15
15
  logger = logging.get_logger(__name__)
16
16
 
17
- LOKI_PORT = 3100
17
+ LOKI_REMOTE_PORT = 3100
18
18
  WEBSOCKET_TIMEOUT = 10
19
19
  INFINITY = 999999
20
20
 
@@ -23,7 +23,7 @@ async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id:
23
23
  ws = await asyncio.wait_for(websockets.connect(loki_url), timeout=WEBSOCKET_TIMEOUT)
24
24
  logger.info(
25
25
  f'{colorama.Fore.YELLOW}Tailing logs from Loki. '
26
- f'Forwarding to port {LOKI_PORT}. Press Ctrl+C to stop. '
26
+ f'Forwarding from remote port {LOKI_REMOTE_PORT}. Press Ctrl+C to stop. '
27
27
  f'{colorama.Style.RESET_ALL}'
28
28
  )
29
29
  try:
@@ -44,7 +44,9 @@ async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id:
44
44
  logger.debug(f'Dropped log: {message}')
45
45
  continue
46
46
  except asyncio.exceptions.TimeoutError:
47
- logger.debug('Websocket timed-out, closing the connection!')
47
+ logger.debug('Websocket timed-out, closing log stream!')
48
+ except KeyboardInterrupt:
49
+ logger.debug('Keyboard interrupt, closing log stream!')
48
50
 
49
51
 
50
52
  def tail_loki_logs_ws(
@@ -57,23 +59,25 @@ def tail_loki_logs_ws(
57
59
  # until we reach the end of the log and then invoke tail again.
58
60
  # Also include checks that the job is running/ever ran.
59
61
  raise ValueError('num_logs must be less than or equal to 5000')
60
- loki_url = f'ws://localhost:{LOKI_PORT}/loki/api/v1/tail'
61
- params = {
62
- 'query': urllib.parse.quote(
63
- f'{{k8s_job_name="{job_name}-workers-0"}} '
64
- f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
65
- ),
66
- 'limit': num_logs,
67
- 'delay': 5,
68
- # TODO(asaiacai): need to auto-generate the start and end times.
69
- }
70
-
71
- query_string = '&'.join(f'{key}={value}' for key, value in params.items())
72
- loki_url += f'?{query_string}'
73
-
74
62
  loki_svc = kr8s.objects.Service.get('loki', namespace='loki')
75
- timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
76
- with kr8s.portforward.PortForward(loki_svc, LOKI_PORT):
63
+ with kr8s.portforward.PortForward(
64
+ loki_svc, LOKI_REMOTE_PORT, local_port='auto'
65
+ ) as port:
66
+ loki_url = f'ws://localhost:{port}/loki/api/v1/tail'
67
+ logger.debug(f'Loki URL: {loki_url}')
68
+ params = {
69
+ 'query': urllib.parse.quote(
70
+ f'{{k8s_job_name="{job_name}-workers-0"}} '
71
+ f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
72
+ ),
73
+ 'limit': num_logs,
74
+ 'delay': 5,
75
+ # TODO(asaiacai): need to auto-generate the start and end times.
76
+ }
77
+
78
+ query_string = '&'.join(f'{key}={value}' for key, value in params.items())
79
+ loki_url += f'?{query_string}'
80
+ timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
77
81
  asyncio.run(_read_loki_logs(loki_url, timeout, job_name, worker_id))
78
82
 
79
83
 
@@ -82,4 +86,4 @@ def tail_loki_logs_ws(
82
86
 
83
87
  # Run the WebSocket log tailing function
84
88
  if __name__ == '__main__':
85
- tail_loki_logs_ws('tune-c3c8', worker_id=0, follow=False)
89
+ tail_loki_logs_ws('tune-bc43', worker_id=0, follow=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250324104653
3
+ Version: 0.1.0.dev20250326104701
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,10 +1,10 @@
1
- konduktor/__init__.py,sha256=Mgfb4Jmm8gV-yHbe8YzLkWOSZwRg690OOkvZPcBsgZg,1477
1
+ konduktor/__init__.py,sha256=S2aE6nXOYJ8l9jDoTBVddj92D9nF5jB65UiR5_Q1UjE,1477
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/common.py,sha256=mYb_6c3u5MghtiFfiW5OO-EH6t7cIR5npbkgUmz6FYE,3517
4
4
  konduktor/adaptors/gcp.py,sha256=liCm4_D_qSci0DZA2t5bckLIoGDkJ8qx31EO_hSBzo0,3751
5
5
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
6
6
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
7
- konduktor/backends/jobset.py,sha256=BN_2aPuaj_oA9Rd5px61Nc-Dgys7Kt8Oz2z4qdAfhm0,8233
7
+ konduktor/backends/jobset.py,sha256=lh_PihQgM0tmVryCpjSsZjWug8hBnJr7ua9lqk0qEAM,8251
8
8
  konduktor/backends/jobset_utils.py,sha256=FR_IDoDU8noTE1qSG-L0KAIe52geeGtekzhInnmjgwc,16636
9
9
  konduktor/check.py,sha256=hIrxDMKaGX2eZP-Pj9TCymGUHQAp93m48Gj3XMiqadA,7833
10
10
  konduktor/cli.py,sha256=90bnh3nIobfBkzqS_SXgw9Z8Zqh4ouwpLDj0kx_6kL8,23562
@@ -81,14 +81,14 @@ konduktor/utils/exceptions.py,sha256=GBOFIkk9nikqWGR0FXGXOWVVImoH7nWnMl_L3Oux3fo
81
81
  konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
82
82
  konduktor/utils/kubernetes_utils.py,sha256=NGBredKPWpZC8VNlwTfWLhHnc-p68d5xlxT-0e92738,23556
83
83
  konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
84
- konduktor/utils/loki_utils.py,sha256=SrRwTYHWGfiqqufY2XcKk4imgdxUCFBZ5oxyhCEJI0Y,3221
84
+ konduktor/utils/loki_utils.py,sha256=ND1pbbbFhLhLKw3870j44LpR_9MB0EkDJSs5K7nWdY4,3473
85
85
  konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
86
86
  konduktor/utils/schemas.py,sha256=4Goihc-NpFQpiJ7RSiKirAIPNWqw_DV_TRqVwejqTDY,17479
87
87
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
88
88
  konduktor/utils/ux_utils.py,sha256=NPNu3Igu2Z9Oq77ghJhy_fIxQZTXWr9BtKyxN3Wslzo,7164
89
89
  konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
90
- konduktor_nightly-0.1.0.dev20250324104653.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
91
- konduktor_nightly-0.1.0.dev20250324104653.dist-info/METADATA,sha256=_4iDoVPSzhlCx8f0DQaAEBnQ5RS4TiEW5lsvUGMGdsg,4070
92
- konduktor_nightly-0.1.0.dev20250324104653.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
93
- konduktor_nightly-0.1.0.dev20250324104653.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
94
- konduktor_nightly-0.1.0.dev20250324104653.dist-info/RECORD,,
90
+ konduktor_nightly-0.1.0.dev20250326104701.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
91
+ konduktor_nightly-0.1.0.dev20250326104701.dist-info/METADATA,sha256=f-kTqSFtPMyPuMFZGHCM9FZuqZ76U6iWpblqvFiMbXA,4070
92
+ konduktor_nightly-0.1.0.dev20250326104701.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
93
+ konduktor_nightly-0.1.0.dev20250326104701.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
94
+ konduktor_nightly-0.1.0.dev20250326104701.dist-info/RECORD,,