konduktor-nightly 0.1.0.dev20250513105010__tar.gz → 0.1.0.dev20250514104854__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/__init__.py +2 -2
  3. konduktor_nightly-0.1.0.dev20250514104854/konduktor/authentication.py +124 -0
  4. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/jobset.py +20 -19
  5. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/jobset_utils.py +47 -5
  6. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/aws/s3.py +5 -2
  7. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/gcs.py +5 -2
  8. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/resource.py +18 -0
  9. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/task.py +2 -2
  10. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/templates/pod.yaml.j2 +180 -16
  11. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/exceptions.py +4 -0
  12. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/kubernetes_utils.py +7 -4
  13. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/loki_utils.py +2 -1
  14. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/schemas.py +29 -82
  15. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/pyproject.toml +1 -1
  16. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/LICENSE +0 -0
  17. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/README.md +0 -0
  18. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/__init__.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/aws.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/common.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/adaptors/gcp.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/__init__.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/backends/backend.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/check.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/cli.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/config.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/constants.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/__init__.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/constants.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/launch.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/node.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/controller/parse.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/README.md +0 -0
  34. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/backend/main.py +0 -0
  35. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/backend/sockets.py +0 -0
  36. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  37. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/.gitignore +0 -0
  38. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  39. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  40. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  42. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  48. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  52. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  53. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/globals.css +0 -0
  54. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  55. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/layout.js +0 -0
  56. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  57. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/app/page.js +0 -0
  58. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  59. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  60. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/package-lock.json +0 -0
  61. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/package.json +0 -0
  62. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  63. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/server.js +0 -0
  64. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  65. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/__init__.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/aws/__init__.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/constants.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/data_utils.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/__init__.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/constants.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/gcp/utils.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/registry.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/storage.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/data/storage_utils.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/execution.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/kube_client.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/logging.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/controller_deployment.yaml +0 -0
  79. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  80. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/templates/jobset.yaml.j2 +0 -0
  83. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/usage/__init__.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/usage/constants.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/__init__.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/accelerator_registry.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/annotations.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/base64_utils.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/common_utils.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/constants.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/env_options.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/kubernetes_enums.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/log_utils.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/rich_utils.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/subprocess_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/ux_utils.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250514104854}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250513105010
3
+ Version: 0.1.0.dev20250514104854
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '2b0d682b6fc8ff0d4e5ea417c4e324090f3c5f9b'
17
+ _KONDUKTOR_COMMIT_SHA = '05c7d9e243ae23c6e9abb0a4a034bfc0815fd587'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250513105010'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250514104854'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -0,0 +1,124 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ The local machine's public key should not be uploaded to the remote VM, because
15
+ it will cause private/public key pair mismatch when the user tries to launch new
16
+ VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
17
+ controller. (Lambda cloud is an exception, due to the limitation of the cloud
18
+ provider. See the comments in setup_lambda_authentication)
19
+ """
20
+
21
+ import functools
22
+ import os
23
+ from typing import Tuple
24
+
25
+ import filelock
26
+
27
+ from konduktor import logging
28
+ from konduktor.utils import common_utils
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+ _SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
33
+
34
+ MAX_TRIALS = 64
35
+
36
+
37
+ def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
38
+ user_hash = common_utils.get_user_hash()
39
+ user_ssh_key_prefix = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
40
+ os.makedirs(os.path.expanduser(user_ssh_key_prefix), exist_ok=True, mode=0o700)
41
+ private_key_path = os.path.join(user_ssh_key_prefix, 'konduktor-key')
42
+ public_key_path = os.path.join(user_ssh_key_prefix, 'konduktor-key.pub')
43
+ lock_path = os.path.join(user_ssh_key_prefix, '.__internal-konduktor-key.lock')
44
+ return private_key_path, public_key_path, lock_path
45
+
46
+
47
+ def _generate_rsa_key_pair() -> Tuple[str, str]:
48
+ # Keep the import of the cryptography local to avoid expensive
49
+ # third-party imports when not needed.
50
+ # pylint: disable=import-outside-toplevel
51
+ from cryptography.hazmat.backends import default_backend
52
+ from cryptography.hazmat.primitives import serialization
53
+ from cryptography.hazmat.primitives.asymmetric import rsa
54
+
55
+ key = rsa.generate_private_key(
56
+ backend=default_backend(), public_exponent=65537, key_size=2048
57
+ )
58
+
59
+ private_key = (
60
+ key.private_bytes(
61
+ encoding=serialization.Encoding.PEM,
62
+ format=serialization.PrivateFormat.TraditionalOpenSSL,
63
+ encryption_algorithm=serialization.NoEncryption(),
64
+ )
65
+ .decode('utf-8')
66
+ .strip()
67
+ )
68
+
69
+ public_key = (
70
+ key.public_key()
71
+ .public_bytes(
72
+ serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
73
+ )
74
+ .decode('utf-8')
75
+ .strip()
76
+ )
77
+
78
+ return public_key, private_key
79
+
80
+
81
+ def _save_key_pair(
82
+ private_key_path: str, public_key_path: str, private_key: str, public_key: str
83
+ ) -> None:
84
+ key_dir = os.path.dirname(private_key_path)
85
+ os.makedirs(key_dir, exist_ok=True, mode=0o700)
86
+
87
+ with open(
88
+ private_key_path,
89
+ 'w',
90
+ encoding='utf-8',
91
+ opener=functools.partial(os.open, mode=0o600),
92
+ ) as f:
93
+ f.write(private_key)
94
+
95
+ with open(
96
+ public_key_path,
97
+ 'w',
98
+ encoding='utf-8',
99
+ opener=functools.partial(os.open, mode=0o644),
100
+ ) as f:
101
+ f.write(public_key)
102
+
103
+
104
+ def get_or_generate_keys() -> Tuple[str, str]:
105
+ """Returns the absolute private and public key paths."""
106
+ private_key_path, public_key_path, lock_path = get_ssh_key_and_lock_path()
107
+ private_key_path = os.path.expanduser(private_key_path)
108
+ public_key_path = os.path.expanduser(public_key_path)
109
+ lock_path = os.path.expanduser(lock_path)
110
+
111
+ lock_dir = os.path.dirname(lock_path)
112
+ # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
113
+ # as the ssh configs will be written to this folder as well in
114
+ # backend_utils.SSHConfigHelper
115
+ os.makedirs(lock_dir, exist_ok=True, mode=0o700)
116
+ with filelock.FileLock(lock_path, timeout=10):
117
+ if not os.path.exists(private_key_path):
118
+ public_key, private_key = _generate_rsa_key_pair()
119
+ _save_key_pair(private_key_path, public_key_path, private_key, public_key)
120
+ assert os.path.exists(public_key_path), (
121
+ 'Private key found, but associated public key '
122
+ f'{public_key_path} does not exist.'
123
+ )
124
+ return private_key_path, public_key_path
@@ -70,25 +70,26 @@ def _wait_for_jobset_start(namespace: str, job_name: str):
70
70
  assert jobsets is not None, (
71
71
  f'Jobset {job_name} ' f'not found in namespace {namespace}'
72
72
  )
73
- if jobsets['status']['replicatedJobsStatus'][0]['ready']:
74
- logger.info(
75
- f'task '
76
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
77
- f'{colorama.Style.RESET_ALL} ready'
78
- )
79
- break
80
- elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
81
- return
82
- elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
83
- logger.info(
84
- f'job '
85
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
86
- f'{colorama.Style.RESET_ALL} '
87
- f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
88
- )
89
- job = jobset_utils.get_job(namespace, job_name)
90
- _raise_job_error(job)
91
- return
73
+ if 'status' in jobsets:
74
+ if jobsets['status']['replicatedJobsStatus'][0]['ready']:
75
+ logger.info(
76
+ f'task '
77
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
78
+ f'{colorama.Style.RESET_ALL} ready'
79
+ )
80
+ break
81
+ elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
82
+ return
83
+ elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
84
+ logger.info(
85
+ f'job '
86
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
87
+ f'{colorama.Style.RESET_ALL} '
88
+ f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
89
+ )
90
+ job = jobset_utils.get_job(namespace, job_name)
91
+ _raise_job_error(job)
92
+ return
92
93
  if timeout != -1 and time.time() - start > timeout:
93
94
  logger.error(
94
95
  f'{colorama.Style.BRIGHT}'
@@ -1,5 +1,6 @@
1
1
  """Jobset utils: wraps CRUD operations for jobsets"""
2
2
 
3
+ import base64
3
4
  import enum
4
5
  import json
5
6
  import os
@@ -15,7 +16,7 @@ if typing.TYPE_CHECKING:
15
16
  from datetime import timedelta
16
17
 
17
18
  import konduktor
18
- from konduktor import config, constants, kube_client, logging
19
+ from konduktor import authentication, config, constants, kube_client, logging
19
20
  from konduktor.data import registry
20
21
  from konduktor.utils import (
21
22
  common_utils,
@@ -93,6 +94,10 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
93
94
  else:
94
95
  accelerator_type = None
95
96
 
97
+ assert task.resources.cpus is not None, 'Task resources cpus are required'
98
+ assert task.resources.memory is not None, 'Task resources memory are required'
99
+ assert task.resources.image_id is not None, 'Task resources image_id are required'
100
+
96
101
  # template the commands to run on the container for syncing files. At this point
97
102
  # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
98
103
  # first we iterate through storage_mounts and then file_mounts.
@@ -150,10 +155,35 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
150
155
  f'though specified by `tailscale.secret_name`: {err}'
151
156
  )
152
157
 
153
- assert task.resources is not None, 'Task resources are required'
154
- assert task.resources.cpus is not None, 'Task resources cpus are required'
155
- assert task.resources.memory is not None, 'Task resources memory are required'
156
- assert task.resources.image_id is not None, 'Task resources image_id are required'
158
+ enable_ssh = config.get_nested(('ssh', 'enable'), False)
159
+ secret_name = None
160
+ if enable_ssh:
161
+ private_key_path, public_key_path = authentication.get_or_generate_keys()
162
+ with (
163
+ open(private_key_path, 'rb') as private_key_file,
164
+ open(public_key_path, 'rb') as public_key_file,
165
+ ):
166
+ private_key, public_key = private_key_file.read(), public_key_file.read()
167
+ user_hash = common_utils.get_user_hash()
168
+ context = kubernetes_utils.get_current_kube_config_context_name()
169
+ namespace = kubernetes_utils.get_kube_config_context_namespace(
170
+ context_name=context
171
+ )
172
+ secret_name = f'konduktor-ssh-keys-{user_hash}'
173
+ ok, result = kubernetes_utils.set_secret(
174
+ secret_name=secret_name,
175
+ namespace=namespace,
176
+ context=context,
177
+ data={
178
+ 'PUBKEY': base64.b64encode(public_key).decode(),
179
+ 'PRIVKEY': base64.b64encode(private_key).decode(),
180
+ },
181
+ )
182
+ if not ok:
183
+ raise exceptions.CreateSecretError(
184
+ f'Failed to set k8s secret {secret_name}: \n{result}'
185
+ )
186
+
157
187
  with tempfile.NamedTemporaryFile() as temp:
158
188
  common_utils.fill_template(
159
189
  'pod.yaml.j2',
@@ -166,6 +196,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
166
196
  'master_addr': master_addr,
167
197
  'num_nodes': task.num_nodes,
168
198
  'job_name': task.name, # append timestamp and user id here?
199
+ 'setup_cmd': task.setup or '',
169
200
  'run_cmd': task.run,
170
201
  'node_hostnames': node_hostnames,
171
202
  'accelerator_type': accelerator_type,
@@ -176,6 +207,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
176
207
  'user': common_utils.get_cleaned_username(),
177
208
  # Tailscale credentials
178
209
  'tailscale_secret': tailscale_secret,
210
+ # SSH
211
+ 'enable_ssh': enable_ssh,
212
+ 'secret_name': secret_name,
179
213
  },
180
214
  temp.name,
181
215
  )
@@ -183,6 +217,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
183
217
  # merge with `~/.konduktor/config.yaml``
184
218
  kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
185
219
  pod_config = common_utils.read_yaml(temp.name)
220
+
221
+ for env_var in pod_config['kubernetes']['pod_config']['spec']['containers'][0][
222
+ 'env'
223
+ ]:
224
+ if env_var['name'] in task.envs:
225
+ env_var['value'] = task.envs.pop(env_var['name'])
226
+
186
227
  for k, v in task.envs.items():
187
228
  pod_config['kubernetes']['pod_config']['spec']['containers'][0][
188
229
  'env'
@@ -221,6 +262,7 @@ def create_jobset(
221
262
  'user': common_utils.get_cleaned_username(),
222
263
  'accelerator_type': accelerator_type,
223
264
  'num_accelerators': num_accelerators,
265
+ 'completions': task.resources.get_completions(),
224
266
  **_JOBSET_METADATA_LABELS,
225
267
  },
226
268
  temp.name,
@@ -1037,8 +1037,11 @@ class S3Store(storage_utils.AbstractStore):
1037
1037
  secret_name=cls._AWS_SECRET_NAME,
1038
1038
  namespace=namespace,
1039
1039
  context=context,
1040
- secret_key=cls._AWS_CREDENTIALS_KEY,
1041
- secret_value=base64_utils.zip_base64encode(credentials_files),
1040
+ data={
1041
+ cls._AWS_CREDENTIALS_KEY: base64_utils.zip_base64encode(
1042
+ credentials_files
1043
+ )
1044
+ },
1042
1045
  )
1043
1046
  if not ok:
1044
1047
  logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
@@ -891,8 +891,11 @@ class GcsStore(storage_utils.AbstractStore):
891
891
  secret_name=cls._GCP_SECRET_NAME,
892
892
  namespace=namespace,
893
893
  context=context,
894
- secret_key=cls._GCP_CREDENTIALS_KEY,
895
- secret_value=base64_utils.zip_base64encode(credentials_files),
894
+ data={
895
+ cls._GCP_CREDENTIALS_KEY: base64_utils.zip_base64encode(
896
+ credentials_files
897
+ )
898
+ },
896
899
  )
897
900
  if not ok:
898
901
  logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
@@ -49,6 +49,7 @@ class Resources:
49
49
  image_id: Union[str, None] = None,
50
50
  disk_size: Optional[int] = None,
51
51
  labels: Optional[Dict[str, str]] = None,
52
+ job_config: Optional[Dict[str, Union[int, str]]] = None,
52
53
  # Internal use only.
53
54
  # pylint: disable=invalid-name
54
55
  _cluster_config_overrides: Optional[Dict[str, Any]] = None,
@@ -91,6 +92,7 @@ class Resources:
91
92
  instance tags. On GCP, labels map to instance labels. On
92
93
  Kubernetes, labels map to pod labels. On other clouds, labels are
93
94
  not supported and will be ignored.
95
+ job_config: the configuration of the job spec
94
96
  Raises:
95
97
  ValueError: if some attributes are invalid.
96
98
  exceptions.NoCloudAccessError: if no public cloud is enabled.
@@ -122,6 +124,7 @@ class Resources:
122
124
  self._set_cpus(cpus)
123
125
  self._set_memory(memory)
124
126
  self._set_accelerators(accelerators)
127
+ self.job_config = job_config
125
128
 
126
129
  # TODO: move these out of init to prevent repeated calls.
127
130
  self._try_validate_cpus_mem()
@@ -382,6 +385,11 @@ class Resources:
382
385
  accel_str = f'{accel_name}:{accel_count}'
383
386
  return accel_str
384
387
 
388
+ def get_completions(self) -> Optional[int]:
389
+ if self.job_config and self.job_config['completions']:
390
+ return int(self.job_config['completions'])
391
+ return None
392
+
385
393
  def copy(self, **override) -> 'Resources':
386
394
  """Returns a copy of the given Resources."""
387
395
  resources = Resources(
@@ -392,6 +400,7 @@ class Resources:
392
400
  disk_size=override.pop('disk_size', self.disk_size),
393
401
  image_id=override.pop('image_id', self.image_id),
394
402
  labels=override.pop('labels', self.labels),
403
+ job_config=override.pop('job_config', self.job_config),
395
404
  )
396
405
  assert len(override) == 0
397
406
  return resources
@@ -404,6 +413,13 @@ class Resources:
404
413
  config, schemas.get_resources_schema(), 'Invalid resources YAML: '
405
414
  )
406
415
 
416
+ if config.get('job_config', None):
417
+ common_utils.validate_schema(
418
+ config['job_config'],
419
+ schemas.get_job_schema(),
420
+ 'Invalid job config YAML',
421
+ )
422
+
407
423
  def _override_resources(
408
424
  base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
409
425
  ) -> List[Resources]:
@@ -446,6 +462,7 @@ class Resources:
446
462
  resources_fields['disk_size'] = config.pop('disk_size', None)
447
463
  resources_fields['image_id'] = config.pop('image_id', None)
448
464
  resources_fields['labels'] = config.pop('labels', None)
465
+ resources_fields['job_config'] = config.pop('job_config', None)
449
466
 
450
467
  if resources_fields['cpus'] is not None:
451
468
  resources_fields['cpus'] = str(resources_fields['cpus'])
@@ -475,4 +492,5 @@ class Resources:
475
492
  add_if_not_none('disk_size', self.disk_size)
476
493
  add_if_not_none('image_id', self.image_id)
477
494
  add_if_not_none('labels', self.labels)
495
+ add_if_not_none('job_config', self.job_config)
478
496
  return config
@@ -181,8 +181,7 @@ class Task:
181
181
  """
182
182
  assert name is not None, 'Task name is required'
183
183
  self.name = name
184
- if setup is not None:
185
- raise ValueError('`setup` is being deprecated and not supported')
184
+ self.setup = setup
186
185
  self.run = run
187
186
  self.storage_mounts: Dict[str, storage_lib.Storage] = {}
188
187
  self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
@@ -320,6 +319,7 @@ class Task:
320
319
 
321
320
  task = Task(
322
321
  config.pop('name', None),
322
+ setup=config.pop('setup', None),
323
323
  run=config.pop('run', None),
324
324
  workdir=config.pop('workdir', None),
325
325
  num_nodes=config.pop('num_nodes', None),