konduktor-nightly 0.1.0.dev20250326104701__tar.gz → 0.1.0.dev20250327104656__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/backends/jobset_utils.py +11 -0
  4. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/templates/jobset.yaml.j2 +1 -0
  5. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/templates/pod.yaml.j2 +14 -4
  6. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/pyproject.toml +1 -1
  7. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/LICENSE +0 -0
  8. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/README.md +0 -0
  9. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/adaptors/__init__.py +0 -0
  10. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/adaptors/common.py +0 -0
  11. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/adaptors/gcp.py +0 -0
  12. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/backends/__init__.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/backends/backend.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/backends/jobset.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/check.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/cli.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/cloud_stores.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/config.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/constants.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/controller/__init__.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/controller/constants.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/controller/launch.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/controller/node.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/controller/parse.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/README.md +0 -0
  26. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/backend/main.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/backend/sockets.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  29. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/.gitignore +0 -0
  30. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  31. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  32. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  33. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  34. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  35. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  36. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  37. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  38. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  39. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  40. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  45. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/globals.css +0 -0
  46. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  47. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/layout.js +0 -0
  48. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  49. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/app/page.js +0 -0
  50. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  51. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  52. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/package-lock.json +0 -0
  53. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/package.json +0 -0
  54. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  55. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/server.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/__init__.py +0 -0
  58. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/constants.py +0 -0
  59. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/data_utils.py +0 -0
  60. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/gcp/__init__.py +0 -0
  61. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/gcp/constants.py +0 -0
  62. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/gcp/gcs.py +0 -0
  63. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/gcp/utils.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/storage.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/data/storage_utils.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/execution.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/kube_client.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/logging.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/manifests/controller_deployment.yaml +0 -0
  70. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  71. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  72. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  73. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/resource.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/task.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/usage/__init__.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/usage/constants.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/__init__.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/accelerator_registry.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/annotations.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/base64_utils.py +0 -0
  81. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/common_utils.py +0 -0
  82. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/constants.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/env_options.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/exceptions.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/kubernetes_enums.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/kubernetes_utils.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/log_utils.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/loki_utils.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/rich_utils.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/schemas.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/subprocess_utils.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/ux_utils.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250326104701 → konduktor_nightly-0.1.0.dev20250327104656}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250326104701
3
+ Version: 0.1.0.dev20250327104656
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = 'c26d3750b0c2f6f2604b4747a767367294157533'
17
+ _KONDUKTOR_COMMIT_SHA = 'ac5a9936d3c1b6a76bffd3f660d68ac80634bda5'
18
18
 
19
19
 
20
20
  def _get_git_commit():
@@ -47,5 +47,5 @@ def _get_git_commit():
47
47
 
48
48
 
49
49
  __commit__ = _get_git_commit()
50
- __version__ = '1.0.0.dev0.1.0.dev20250326104701'
50
+ __version__ = '1.0.0.dev0.1.0.dev20250327104656'
51
51
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -30,6 +30,7 @@ JOBSET_USER_LABEL = 'trainy.ai/username'
30
30
  JOBSET_ACCELERATOR_LABEL = 'trainy.ai/accelerator'
31
31
  JOBSET_NUM_ACCELERATORS_LABEL = 'trainy.ai/num-accelerators'
32
32
 
33
+
33
34
  _JOBSET_METADATA_LABELS = {
34
35
  'jobset_name_label': JOBSET_NAME_LABEL,
35
36
  'jobset_userid_label': JOBSET_USERID_LABEL,
@@ -37,6 +38,8 @@ _JOBSET_METADATA_LABELS = {
37
38
  'jobset_accelerator_label': JOBSET_ACCELERATOR_LABEL,
38
39
  'jobset_num_accelerators_label': JOBSET_NUM_ACCELERATORS_LABEL,
39
40
  }
41
+ _RUN_DURATION_ANNOTATION = 'maxRunDurationSeconds'
42
+ _RUN_DURATION_ANNOTATION_KEY = 'kueue.x-k8s.io/maxRunDurationSeconds'
40
43
 
41
44
 
42
45
  class JobNotFoundError(Exception):
@@ -196,6 +199,14 @@ def create_jobset(
196
199
  temp.name,
197
200
  )
198
201
  jobset_spec = common_utils.read_yaml(temp.name)
202
+ jobset_spec['jobset']['metadata']['labels'].update(**task.resources.labels)
203
+ assert task.resources.labels is not None
204
+ maxRunDurationSeconds = task.resources.labels.get('maxRunDurationSeconds', None)
205
+ if not maxRunDurationSeconds:
206
+ raise ValueError('maxRunDurationSeconds is required')
207
+ jobset_spec['jobset']['metadata']['annotations'][
208
+ _RUN_DURATION_ANNOTATION_KEY
209
+ ] = str(maxRunDurationSeconds)
199
210
  jobset_spec['jobset']['spec']['replicatedJobs'][0]['template']['spec'][
200
211
  'template'
201
212
  ] = pod_spec # noqa: E501
@@ -13,6 +13,7 @@ jobset:
13
13
  {% endif %}
14
14
  trainy.ai/konduktor-managed: "true"
15
15
  parent: "trainy"
16
+ annotations: {}
16
17
  spec:
17
18
  ttlSecondsAfterFinished: 259200 # 3 days
18
19
  replicatedJobs:
@@ -71,10 +71,13 @@ kubernetes:
71
71
  [ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
72
72
 
73
73
 
74
- # Run apt update, install missing packages
75
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
76
- $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
77
- PACKAGES="rsync curl";
74
+ PACKAGES="";
75
+ {% if 'rsync' in run_cmd %}
76
+ PACKAGES="$PACKAGES rsync";
77
+ {% endif %}
78
+ {% if 'curl' in run_cmd %}
79
+ PACKAGES="$PACKAGES curl";
80
+ {% endif %}
78
81
  {% if 'gs' in mount_secrets %}
79
82
  PACKAGES="$PACKAGES unzip wget";
80
83
  {% endif %}
@@ -82,6 +85,13 @@ kubernetes:
82
85
  PACKAGES="$PACKAGES git";
83
86
  {% endif %}
84
87
 
88
+ if [ -z "${PACKAGES}" ]; then
89
+ # Run apt update, install missing packages
90
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
91
+ $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
92
+ fi
93
+
94
+
85
95
  # Separate packages into two groups: packages that are installed first
86
96
  # so that curl and rsync are available sooner to unblock the following
87
97
  # conda installation and rsync.
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250326104701"
3
+ version = "0.1.0.dev20250327104656"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}