konduktor-nightly 0.1.0.dev20250323104309__tar.gz → 0.1.0.dev20250325104729__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/backends/jobset.py +1 -0
  4. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/loki_utils.py +24 -20
  5. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/pyproject.toml +5 -1
  6. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/LICENSE +0 -0
  7. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/README.md +0 -0
  8. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/adaptors/__init__.py +0 -0
  9. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/adaptors/common.py +0 -0
  10. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/adaptors/gcp.py +0 -0
  11. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/backends/__init__.py +0 -0
  12. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/backends/backend.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/backends/jobset_utils.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/check.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/cli.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/cloud_stores.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/config.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/constants.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/controller/__init__.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/controller/constants.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/controller/launch.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/controller/node.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/controller/parse.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/README.md +0 -0
  25. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/backend/main.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/backend/sockets.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  28. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/.gitignore +0 -0
  29. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  30. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  31. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  32. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  33. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  34. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  35. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  36. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  37. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  38. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  39. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/globals.css +0 -0
  45. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  46. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/layout.js +0 -0
  47. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  48. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/app/page.js +0 -0
  49. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  50. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  51. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/package-lock.json +0 -0
  52. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/package.json +0 -0
  53. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  54. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/server.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/__init__.py +0 -0
  57. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/constants.py +0 -0
  58. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/data_utils.py +0 -0
  59. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/gcp/__init__.py +0 -0
  60. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/gcp/constants.py +0 -0
  61. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/gcp/gcs.py +0 -0
  62. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/gcp/utils.py +0 -0
  63. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/storage.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/data/storage_utils.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/execution.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/kube_client.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/logging.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/manifests/controller_deployment.yaml +0 -0
  69. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  70. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  71. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  72. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/resource.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/task.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/templates/jobset.yaml.j2 +0 -0
  75. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/templates/pod.yaml.j2 +0 -0
  76. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/usage/__init__.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/usage/constants.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/__init__.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/accelerator_registry.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/annotations.py +0 -0
  81. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/base64_utils.py +0 -0
  82. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/common_utils.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/constants.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/env_options.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/exceptions.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/kubernetes_enums.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/kubernetes_utils.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/log_utils.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/rich_utils.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/schemas.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/subprocess_utils.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/ux_utils.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250323104309 → konduktor_nightly-0.1.0.dev20250325104729}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250323104309
3
+ Version: 0.1.0.dev20250325104729
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '1099bba7fab85c3954acf0b9de385239105f2047'
17
+ _KONDUKTOR_COMMIT_SHA = 'eeee2ab274425758fdc33addddadd0d9270b9c59'
18
18
 
19
19
 
20
20
  def _get_git_commit():
@@ -47,5 +47,5 @@ def _get_git_commit():
47
47
 
48
48
 
49
49
  __commit__ = _get_git_commit()
50
- __version__ = '1.0.0.dev0.1.0.dev20250323104309'
50
+ __version__ = '1.0.0.dev0.1.0.dev20250325104729'
51
51
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -58,6 +58,7 @@ def _raise_job_error(job):
58
58
 
59
59
 
60
60
  def _wait_for_jobset_start(namespace: str, job_name: str):
61
+ time.sleep(2)
61
62
  start = time.time()
62
63
  timeout = config.get_nested(
63
64
  ('kubernetes', 'provision_timeout'),
@@ -14,7 +14,7 @@ from konduktor import logging
14
14
 
15
15
  logger = logging.get_logger(__name__)
16
16
 
17
- LOKI_PORT = 3100
17
+ LOKI_REMOTE_PORT = 3100
18
18
  WEBSOCKET_TIMEOUT = 10
19
19
  INFINITY = 999999
20
20
 
@@ -23,7 +23,7 @@ async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id:
23
23
  ws = await asyncio.wait_for(websockets.connect(loki_url), timeout=WEBSOCKET_TIMEOUT)
24
24
  logger.info(
25
25
  f'{colorama.Fore.YELLOW}Tailing logs from Loki. '
26
- f'Forwarding to port {LOKI_PORT}. Press Ctrl+C to stop. '
26
+ f'Forwarding from remote port {LOKI_REMOTE_PORT}. Press Ctrl+C to stop. '
27
27
  f'{colorama.Style.RESET_ALL}'
28
28
  )
29
29
  try:
@@ -44,7 +44,9 @@ async def _read_loki_logs(loki_url: str, timeout: int, job_name: str, worker_id:
44
44
  logger.debug(f'Dropped log: {message}')
45
45
  continue
46
46
  except asyncio.exceptions.TimeoutError:
47
- logger.debug('Websocket timed-out, closing the connection!')
47
+ logger.debug('Websocket timed-out, closing log stream!')
48
+ except KeyboardInterrupt:
49
+ logger.debug('Keyboard interrupt, closing log stream!')
48
50
 
49
51
 
50
52
  def tail_loki_logs_ws(
@@ -57,23 +59,25 @@ def tail_loki_logs_ws(
57
59
  # until we reach the end of the log and then invoke tail again.
58
60
  # Also include checks that the job is running/ever ran.
59
61
  raise ValueError('num_logs must be less than or equal to 5000')
60
- loki_url = f'ws://localhost:{LOKI_PORT}/loki/api/v1/tail'
61
- params = {
62
- 'query': urllib.parse.quote(
63
- f'{{k8s_job_name="{job_name}-workers-0"}} '
64
- f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
65
- ),
66
- 'limit': num_logs,
67
- 'delay': 5,
68
- # TODO(asaiacai): need to auto-generate the start and end times.
69
- }
70
-
71
- query_string = '&'.join(f'{key}={value}' for key, value in params.items())
72
- loki_url += f'?{query_string}'
73
-
74
62
  loki_svc = kr8s.objects.Service.get('loki', namespace='loki')
75
- timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
76
- with kr8s.portforward.PortForward(loki_svc, LOKI_PORT):
63
+ with kr8s.portforward.PortForward(
64
+ loki_svc, LOKI_REMOTE_PORT, local_port='auto'
65
+ ) as port:
66
+ loki_url = f'ws://localhost:{port}/loki/api/v1/tail'
67
+ logger.debug(f'Loki URL: {loki_url}')
68
+ params = {
69
+ 'query': urllib.parse.quote(
70
+ f'{{k8s_job_name="{job_name}-workers-0"}} '
71
+ f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
72
+ ),
73
+ 'limit': num_logs,
74
+ 'delay': 5,
75
+ # TODO(asaiacai): need to auto-generate the start and end times.
76
+ }
77
+
78
+ query_string = '&'.join(f'{key}={value}' for key, value in params.items())
79
+ loki_url += f'?{query_string}'
80
+ timeout = INFINITY if follow else WEBSOCKET_TIMEOUT
77
81
  asyncio.run(_read_loki_logs(loki_url, timeout, job_name, worker_id))
78
82
 
79
83
 
@@ -82,4 +86,4 @@ def tail_loki_logs_ws(
82
86
 
83
87
  # Run the WebSocket log tailing function
84
88
  if __name__ == '__main__':
85
- tail_loki_logs_ws('tune-c3c8', worker_id=0, follow=False)
89
+ tail_loki_logs_ws('tune-bc43', worker_id=0, follow=True)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250323104309"
3
+ version = "0.1.0.dev20250325104729"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}
@@ -57,6 +57,10 @@ quote-style = "single"
57
57
  [tool.ruff.lint.flake8-quotes]
58
58
  docstring-quotes = "double"
59
59
 
60
+ [tool.ruff.lint.per-file-ignores]
61
+ "tests/smoke_tests/test_basic.py" = ["E501"]
62
+ "tests/test_smoke.py" = ["F403"]
63
+
60
64
  [tool.mypy]
61
65
  python_version = "3.10"
62
66
  follow_imports = "skip"