konduktor-nightly 0.1.0.dev20250513105010__tar.gz → 0.1.0.dev20250515104942__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/common.py +1 -2
  4. konduktor_nightly-0.1.0.dev20250515104942/konduktor/authentication.py +124 -0
  5. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/jobset.py +20 -19
  6. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/jobset_utils.py +47 -5
  7. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/cli.py +2 -3
  8. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/aws/s3.py +5 -2
  9. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/data_utils.py +2 -2
  10. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/gcs.py +5 -2
  11. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/storage.py +6 -7
  12. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/execution.py +2 -2
  13. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/kube_client.py +0 -3
  14. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/resource.py +18 -0
  15. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/task.py +3 -3
  16. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/templates/pod.yaml.j2 +182 -16
  17. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/exceptions.py +5 -1
  18. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/kubernetes_utils.py +7 -4
  19. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/log_utils.py +0 -2
  20. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/loki_utils.py +2 -1
  21. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/rich_utils.py +1 -1
  22. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/schemas.py +29 -82
  23. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/pyproject.toml +1 -1
  24. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/LICENSE +0 -0
  25. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/README.md +0 -0
  26. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/__init__.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/aws.py +0 -0
  28. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/adaptors/gcp.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/__init__.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/backends/backend.py +0 -0
  31. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/check.py +0 -0
  32. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/config.py +0 -0
  33. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/constants.py +0 -0
  34. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/__init__.py +0 -0
  35. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/constants.py +0 -0
  36. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/launch.py +0 -0
  37. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/node.py +0 -0
  38. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/controller/parse.py +0 -0
  39. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/README.md +0 -0
  40. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/backend/main.py +0 -0
  41. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/backend/sockets.py +0 -0
  42. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  43. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/.gitignore +0 -0
  44. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  45. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  46. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  47. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  48. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  49. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  50. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  51. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  52. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  53. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  54. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  55. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  56. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  57. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  58. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  59. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/globals.css +0 -0
  60. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  61. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/layout.js +0 -0
  62. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  63. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/app/page.js +0 -0
  64. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  65. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  66. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/package-lock.json +0 -0
  67. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/package.json +0 -0
  68. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  69. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/server.js +0 -0
  70. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  71. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/__init__.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/aws/__init__.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/constants.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/__init__.py +0 -0
  75. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/constants.py +0 -0
  76. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/gcp/utils.py +0 -0
  77. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/registry.py +0 -0
  78. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/data/storage_utils.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/logging.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/controller_deployment.yaml +0 -0
  81. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  82. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  83. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  84. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/templates/jobset.yaml.j2 +0 -0
  85. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/usage/__init__.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/usage/constants.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/__init__.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/accelerator_registry.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/annotations.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/base64_utils.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/common_utils.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/constants.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/env_options.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/kubernetes_enums.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/subprocess_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/ux_utils.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250513105010 → konduktor_nightly-0.1.0.dev20250515104942}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250513105010
3
+ Version: 0.1.0.dev20250515104942
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '2b0d682b6fc8ff0d4e5ea417c4e324090f3c5f9b'
17
+ _KONDUKTOR_COMMIT_SHA = 'c0bd8e8774fab8042721b43a8cb8c35a624f8299'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250513105010'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250515104942'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -26,8 +26,7 @@ class LazyImport:
26
26
 
27
27
  We use this for pandas and networkx, as they can be time-consuming to import
28
28
  (0.1-0.2 seconds). With this class, we can avoid the unnecessary import time
29
- when the module is not used (e.g., `networkx` should not be imported for
30
- `sky status and `pandas` should not be imported for `sky exec`).
29
+ when the module is not used.
31
30
 
32
31
  We also use this for cloud adaptors, because we do not want to import the
33
32
  cloud dependencies when it is not enabled.
@@ -0,0 +1,124 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ The local machine's public key should not be uploaded to the remote VM, because
15
+ it will cause private/public key pair mismatch when the user tries to launch new
16
+ VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
17
+ controller. (Lambda cloud is an exception, due to the limitation of the cloud
18
+ provider. See the comments in setup_lambda_authentication)
19
+ """
20
+
21
+ import functools
22
+ import os
23
+ from typing import Tuple
24
+
25
+ import filelock
26
+
27
+ from konduktor import logging
28
+ from konduktor.utils import common_utils
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+ _SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
33
+
34
+ MAX_TRIALS = 64
35
+
36
+
37
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
    """Return the private key, public key and lock file paths for this user.

    The per-user key directory (with ``~`` expanded) is created with mode
    0o700 as a side effect. The returned paths are built from the unexpanded
    prefix, so they still start with ``~``; callers are expected to run them
    through ``os.path.expanduser`` before touching the filesystem.

    Returns:
        A ``(private_key_path, public_key_path, lock_path)`` tuple.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=common_utils.get_user_hash())
    os.makedirs(os.path.expanduser(key_dir), mode=0o700, exist_ok=True)

    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, file_name)
        for file_name in (
            'konduktor-key',
            'konduktor-key.pub',
            '.__internal-konduktor-key.lock',
        )
    )
    return private_key_path, public_key_path, lock_path
45
+
46
+
47
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Create a fresh 2048-bit RSA key pair (public exponent 65537).

    Returns:
        A ``(public_key, private_key)`` tuple of whitespace-stripped UTF-8
        strings. Note the order: the public key comes first. The public key
        is serialized in OpenSSH format; the private key is unencrypted PEM
        in the TraditionalOpenSSL format.
    """
    # cryptography is imported lazily so that importing this module stays
    # cheap for callers that never need to generate a key.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(
        public_exponent=65537, key_size=2048, backend=default_backend()
    )

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
    )

    private_key = private_pem.decode('utf-8').strip()
    public_key = public_openssh.decode('utf-8').strip()
    return public_key, private_key
79
+
80
+
81
def _save_key_pair(
    private_key_path: str, public_key_path: str, private_key: str, public_key: str
) -> None:
    """Write a key pair to disk with restrictive permissions.

    The directory containing the private key is created with mode 0o700 if
    needed. The private key file always ends up with mode 0o600, even when a
    file already existed at that path with laxer permissions. The public key
    file is created with mode 0o644.

    Args:
        private_key_path: Destination file for the private key.
        public_key_path: Destination file for the public key.
        private_key: Private key text, written as UTF-8.
        public_key: Public key text, written as UTF-8.
    """
    key_dir = os.path.dirname(private_key_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if key_dir:
        os.makedirs(key_dir, exist_ok=True, mode=0o700)

    with open(
        private_key_path,
        'w',
        encoding='utf-8',
        opener=functools.partial(os.open, mode=0o600),
    ) as f:
        # The mode given to os.open only takes effect when the file is
        # created. Tighten a pre-existing file before the secret is written
        # so it is never exposed through stale, wider permissions.
        os.chmod(private_key_path, 0o600)
        f.write(private_key)

    with open(
        public_key_path,
        'w',
        encoding='utf-8',
        opener=functools.partial(os.open, mode=0o644),
    ) as f:
        f.write(public_key)
102
+
103
+
104
def get_or_generate_keys() -> Tuple[str, str]:
    """Return the absolute private and public key paths, creating keys if needed.

    A new RSA key pair is generated and saved only when no private key file
    exists yet. The existence check and the generation run under a file lock
    (10 second timeout).

    Returns:
        A ``(private_key_path, public_key_path)`` tuple with ``~`` expanded.

    Raises:
        AssertionError: if the public key file does not exist after the
            private key has been found or generated.
    """
    private_key_path, public_key_path, lock_path = map(
        os.path.expanduser, get_ssh_key_and_lock_path()
    )

    # The directory holding the lock file (and the keys) is created with
    # 0o700 permission before the lock is taken.
    os.makedirs(os.path.dirname(lock_path), mode=0o700, exist_ok=True)

    with filelock.FileLock(lock_path, timeout=10):
        key_missing = not os.path.exists(private_key_path)
        if key_missing:
            # _generate_rsa_key_pair returns the public key first.
            public_key, private_key = _generate_rsa_key_pair()
            _save_key_pair(private_key_path, public_key_path, private_key, public_key)

    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.'
    )
    return private_key_path, public_key_path
@@ -70,25 +70,26 @@ def _wait_for_jobset_start(namespace: str, job_name: str):
70
70
  assert jobsets is not None, (
71
71
  f'Jobset {job_name} ' f'not found in namespace {namespace}'
72
72
  )
73
- if jobsets['status']['replicatedJobsStatus'][0]['ready']:
74
- logger.info(
75
- f'task '
76
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
77
- f'{colorama.Style.RESET_ALL} ready'
78
- )
79
- break
80
- elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
81
- return
82
- elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
83
- logger.info(
84
- f'job '
85
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
86
- f'{colorama.Style.RESET_ALL} '
87
- f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
88
- )
89
- job = jobset_utils.get_job(namespace, job_name)
90
- _raise_job_error(job)
91
- return
73
+ if 'status' in jobsets:
74
+ if jobsets['status']['replicatedJobsStatus'][0]['ready']:
75
+ logger.info(
76
+ f'task '
77
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
78
+ f'{colorama.Style.RESET_ALL} ready'
79
+ )
80
+ break
81
+ elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
82
+ return
83
+ elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
84
+ logger.info(
85
+ f'job '
86
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
87
+ f'{colorama.Style.RESET_ALL} '
88
+ f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
89
+ )
90
+ job = jobset_utils.get_job(namespace, job_name)
91
+ _raise_job_error(job)
92
+ return
92
93
  if timeout != -1 and time.time() - start > timeout:
93
94
  logger.error(
94
95
  f'{colorama.Style.BRIGHT}'
@@ -1,5 +1,6 @@
1
1
  """Jobset utils: wraps CRUD operations for jobsets"""
2
2
 
3
+ import base64
3
4
  import enum
4
5
  import json
5
6
  import os
@@ -15,7 +16,7 @@ if typing.TYPE_CHECKING:
15
16
  from datetime import timedelta
16
17
 
17
18
  import konduktor
18
- from konduktor import config, constants, kube_client, logging
19
+ from konduktor import authentication, config, constants, kube_client, logging
19
20
  from konduktor.data import registry
20
21
  from konduktor.utils import (
21
22
  common_utils,
@@ -93,6 +94,10 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
93
94
  else:
94
95
  accelerator_type = None
95
96
 
97
+ assert task.resources.cpus is not None, 'Task resources cpus are required'
98
+ assert task.resources.memory is not None, 'Task resources memory are required'
99
+ assert task.resources.image_id is not None, 'Task resources image_id are required'
100
+
96
101
  # template the commands to run on the container for syncing files. At this point
97
102
  # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
98
103
  # first we iterate through storage_mounts and then file_mounts.
@@ -150,10 +155,35 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
150
155
  f'though specified by `tailscale.secret_name`: {err}'
151
156
  )
152
157
 
153
- assert task.resources is not None, 'Task resources are required'
154
- assert task.resources.cpus is not None, 'Task resources cpus are required'
155
- assert task.resources.memory is not None, 'Task resources memory are required'
156
- assert task.resources.image_id is not None, 'Task resources image_id are required'
158
+ enable_ssh = config.get_nested(('ssh', 'enable'), False)
159
+ secret_name = None
160
+ if enable_ssh:
161
+ private_key_path, public_key_path = authentication.get_or_generate_keys()
162
+ with (
163
+ open(private_key_path, 'rb') as private_key_file,
164
+ open(public_key_path, 'rb') as public_key_file,
165
+ ):
166
+ private_key, public_key = private_key_file.read(), public_key_file.read()
167
+ user_hash = common_utils.get_user_hash()
168
+ context = kubernetes_utils.get_current_kube_config_context_name()
169
+ namespace = kubernetes_utils.get_kube_config_context_namespace(
170
+ context_name=context
171
+ )
172
+ secret_name = f'konduktor-ssh-keys-{user_hash}'
173
+ ok, result = kubernetes_utils.set_secret(
174
+ secret_name=secret_name,
175
+ namespace=namespace,
176
+ context=context,
177
+ data={
178
+ 'PUBKEY': base64.b64encode(public_key).decode(),
179
+ 'PRIVKEY': base64.b64encode(private_key).decode(),
180
+ },
181
+ )
182
+ if not ok:
183
+ raise exceptions.CreateSecretError(
184
+ f'Failed to set k8s secret {secret_name}: \n{result}'
185
+ )
186
+
157
187
  with tempfile.NamedTemporaryFile() as temp:
158
188
  common_utils.fill_template(
159
189
  'pod.yaml.j2',
@@ -166,6 +196,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
166
196
  'master_addr': master_addr,
167
197
  'num_nodes': task.num_nodes,
168
198
  'job_name': task.name, # append timestamp and user id here?
199
+ 'setup_cmd': task.setup or '',
169
200
  'run_cmd': task.run,
170
201
  'node_hostnames': node_hostnames,
171
202
  'accelerator_type': accelerator_type,
@@ -176,6 +207,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
176
207
  'user': common_utils.get_cleaned_username(),
177
208
  # Tailscale credentials
178
209
  'tailscale_secret': tailscale_secret,
210
+ # SSH
211
+ 'enable_ssh': enable_ssh,
212
+ 'secret_name': secret_name,
179
213
  },
180
214
  temp.name,
181
215
  )
@@ -183,6 +217,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
183
217
  # merge with `~/.konduktor/config.yaml``
184
218
  kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
185
219
  pod_config = common_utils.read_yaml(temp.name)
220
+
221
+ for env_var in pod_config['kubernetes']['pod_config']['spec']['containers'][0][
222
+ 'env'
223
+ ]:
224
+ if env_var['name'] in task.envs:
225
+ env_var['value'] = task.envs.pop(env_var['name'])
226
+
186
227
  for k, v in task.envs.items():
187
228
  pod_config['kubernetes']['pod_config']['spec']['containers'][0][
188
229
  'env'
@@ -221,6 +262,7 @@ def create_jobset(
221
262
  'user': common_utils.get_cleaned_username(),
222
263
  'accelerator_type': accelerator_type,
223
264
  'num_accelerators': num_accelerators,
265
+ 'completions': task.resources.get_completions(),
224
266
  **_JOBSET_METADATA_LABELS,
225
267
  },
226
268
  temp.name,
@@ -105,7 +105,7 @@ def _make_task_with_overrides(
105
105
  env: Optional[List[Tuple[str, str]]] = None,
106
106
  field_to_ignore: Optional[List[str]] = None,
107
107
  ) -> konduktor.Task:
108
- """Creates a task or a dag from an entrypoint with overrides.
108
+ """Creates a task from an entrypoint with overrides.
109
109
 
110
110
  Returns:
111
111
  konduktor.Task
@@ -271,8 +271,7 @@ _EXTRA_RESOURCES_OPTIONS = [
271
271
  type=str,
272
272
  help=(
273
273
  'Type and number of GPUs to use. Example values: '
274
- '"V100:8", "V100" (short for a count of 1), or "V100:0.5" '
275
- '(fractional counts are supported by the scheduling framework). '
274
+ '"V100:8", "V100" (short for a count of 1)'
276
275
  'If a new cluster is being launched by this command, this is the '
277
276
  'resources to provision. If an existing cluster is being reused, this'
278
277
  " is seen as the task demand, which must fit the cluster's total "
@@ -1037,8 +1037,11 @@ class S3Store(storage_utils.AbstractStore):
1037
1037
  secret_name=cls._AWS_SECRET_NAME,
1038
1038
  namespace=namespace,
1039
1039
  context=context,
1040
- secret_key=cls._AWS_CREDENTIALS_KEY,
1041
- secret_value=base64_utils.zip_base64encode(credentials_files),
1040
+ data={
1041
+ cls._AWS_CREDENTIALS_KEY: base64_utils.zip_base64encode(
1042
+ credentials_files
1043
+ )
1044
+ },
1042
1045
  )
1043
1046
  if not ok:
1044
1047
  logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
@@ -219,10 +219,10 @@ def get_gsutil_command() -> Tuple[str, str]:
219
219
  cmd_to_run = f'{alias_gen}; {gsutil_alias} cp ...'
220
220
  ```
221
221
  """
222
- gsutil_alias = 'skypilot_gsutil'
222
+ gsutil_alias = 'konduktor_gsutil'
223
223
  disable_multiprocessing_flag = '-o "GSUtil:parallel_process_count=1"'
224
224
 
225
- # Define skypilot_gsutil as a shell function instead of an alias.
225
+ # Define konduktor_gsutil as a shell function instead of an alias.
226
226
  # This function will behave just like alias, but can be called immediately
227
227
  # after its definition on the same line
228
228
  alias_gen = (
@@ -891,8 +891,11 @@ class GcsStore(storage_utils.AbstractStore):
891
891
  secret_name=cls._GCP_SECRET_NAME,
892
892
  namespace=namespace,
893
893
  context=context,
894
- secret_key=cls._GCP_CREDENTIALS_KEY,
895
- secret_value=base64_utils.zip_base64encode(credentials_files),
894
+ data={
895
+ cls._GCP_CREDENTIALS_KEY: base64_utils.zip_base64encode(
896
+ credentials_files
897
+ )
898
+ },
896
899
  )
897
900
  if not ok:
898
901
  logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
@@ -271,15 +271,14 @@ class Storage(object):
271
271
  Can be a single local path, a list of local paths, or a cloud URI
272
272
  (s3://, gs://, etc.). Local paths do not need to be absolute.
273
273
  stores: Optional; Specify pre-initialized stores (S3Store, GcsStore).
274
- persistent: bool; Whether to persist across sky launches.
274
+ persistent: bool; Whether to persist across konduktor launches.
275
275
  mode: StorageMode; Specify how the storage object is manifested on
276
276
  the remote VM. Can be either MOUNT or COPY. Defaults to MOUNT.
277
- sync_on_reconstruction: bool; Whether to sync the data if the storage
278
- object is found in the global_user_state and reconstructed from
279
- there. This is set to false when the Storage object is created not
280
- for direct use, e.g. for 'sky storage delete', or the storage is
281
- being re-used, e.g., for `sky start` on a stopped cluster.
282
- _is_sky_managed: Optional[bool]; Indicates if the storage is managed
277
+ sync_on_reconstruction: bool; [defunct] Whether to sync the
278
+ data if the storage object is found in the global_user_state
279
+ and reconstructed from there. This is set to
280
+ false when the Storage object is created not for direct use
281
+ _is_sky_managed: Optional[bool]; [defunct] Indicates if the storage is managed
283
282
  by Sky. Without this argument, the controller's behavior differs
284
283
  from the local machine. For example, if a bucket does not exist:
285
284
  Local Machine (is_sky_managed=True) →
@@ -149,10 +149,10 @@ def maybe_translate_local_file_mounts_and_sync_up(
149
149
  msg = 'workdir'
150
150
  if msg:
151
151
  logger.info(
152
- ux_utils.starting_message(f'Translating {msg} to ' 'SkyPilot Storage...')
152
+ ux_utils.starting_message(f'Translating {msg} to ' 'cloud Storage...')
153
153
  )
154
154
  rich_utils.force_update_status(
155
- ux_utils.spinner_message(f'Translating {msg} to SkyPilot Storage...')
155
+ ux_utils.spinner_message(f'Translating {msg} to cloud Storage...')
156
156
  )
157
157
 
158
158
  # Get the bucket name for the workdir and file mounts,
@@ -63,8 +63,6 @@ def _load_config(context: Optional[str] = None):
63
63
  err_str = (
64
64
  f'Failed to load Kubernetes configuration for {context!r}. '
65
65
  'Kubeconfig does not contain any valid context(s).\n'
66
- ' If you were running a local Kubernetes '
67
- 'cluster, run `sky local up` to start the cluster.'
68
66
  )
69
67
  else:
70
68
  err_str = (
@@ -72,7 +70,6 @@ def _load_config(context: Optional[str] = None):
72
70
  'Please check if your kubeconfig file exists at '
73
71
  f'~/.kube/config and is valid.'
74
72
  )
75
- err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
76
73
  with ux_utils.print_exception_no_traceback():
77
74
  raise ValueError(err_str) from None
78
75
 
@@ -49,6 +49,7 @@ class Resources:
49
49
  image_id: Union[str, None] = None,
50
50
  disk_size: Optional[int] = None,
51
51
  labels: Optional[Dict[str, str]] = None,
52
+ job_config: Optional[Dict[str, Union[int, str]]] = None,
52
53
  # Internal use only.
53
54
  # pylint: disable=invalid-name
54
55
  _cluster_config_overrides: Optional[Dict[str, Any]] = None,
@@ -91,6 +92,7 @@ class Resources:
91
92
  instance tags. On GCP, labels map to instance labels. On
92
93
  Kubernetes, labels map to pod labels. On other clouds, labels are
93
94
  not supported and will be ignored.
95
+ job_config: the configuration of the job spec
94
96
  Raises:
95
97
  ValueError: if some attributes are invalid.
96
98
  exceptions.NoCloudAccessError: if no public cloud is enabled.
@@ -122,6 +124,7 @@ class Resources:
122
124
  self._set_cpus(cpus)
123
125
  self._set_memory(memory)
124
126
  self._set_accelerators(accelerators)
127
+ self.job_config = job_config
125
128
 
126
129
  # TODO: move these out of init to prevent repeated calls.
127
130
  self._try_validate_cpus_mem()
@@ -382,6 +385,11 @@ class Resources:
382
385
  accel_str = f'{accel_name}:{accel_count}'
383
386
  return accel_str
384
387
 
388
def get_completions(self) -> Optional[int]:
    """Return the ``completions`` value from ``job_config`` as an int.

    Returns:
        ``int(job_config['completions'])`` when ``job_config`` is set and
        holds a truthy ``completions`` entry; otherwise ``None``. A
        ``job_config`` without a ``completions`` key yields ``None`` rather
        than raising ``KeyError``.
    """
    if not self.job_config:
        return None
    # .get() tolerates a job_config that only carries other keys.
    completions = self.job_config.get('completions')
    if not completions:
        return None
    return int(completions)
392
+
385
393
  def copy(self, **override) -> 'Resources':
386
394
  """Returns a copy of the given Resources."""
387
395
  resources = Resources(
@@ -392,6 +400,7 @@ class Resources:
392
400
  disk_size=override.pop('disk_size', self.disk_size),
393
401
  image_id=override.pop('image_id', self.image_id),
394
402
  labels=override.pop('labels', self.labels),
403
+ job_config=override.pop('job_config', self.job_config),
395
404
  )
396
405
  assert len(override) == 0
397
406
  return resources
@@ -404,6 +413,13 @@ class Resources:
404
413
  config, schemas.get_resources_schema(), 'Invalid resources YAML: '
405
414
  )
406
415
 
416
+ if config.get('job_config', None):
417
+ common_utils.validate_schema(
418
+ config['job_config'],
419
+ schemas.get_job_schema(),
420
+ 'Invalid job config YAML',
421
+ )
422
+
407
423
  def _override_resources(
408
424
  base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
409
425
  ) -> List[Resources]:
@@ -446,6 +462,7 @@ class Resources:
446
462
  resources_fields['disk_size'] = config.pop('disk_size', None)
447
463
  resources_fields['image_id'] = config.pop('image_id', None)
448
464
  resources_fields['labels'] = config.pop('labels', None)
465
+ resources_fields['job_config'] = config.pop('job_config', None)
449
466
 
450
467
  if resources_fields['cpus'] is not None:
451
468
  resources_fields['cpus'] = str(resources_fields['cpus'])
@@ -475,4 +492,5 @@ class Resources:
475
492
  add_if_not_none('disk_size', self.disk_size)
476
493
  add_if_not_none('image_id', self.image_id)
477
494
  add_if_not_none('labels', self.labels)
495
+ add_if_not_none('job_config', self.job_config)
478
496
  return config
@@ -181,8 +181,7 @@ class Task:
181
181
  """
182
182
  assert name is not None, 'Task name is required'
183
183
  self.name = name
184
- if setup is not None:
185
- raise ValueError('`setup` is being deprecated and not supported')
184
+ self.setup = setup
186
185
  self.run = run
187
186
  self.storage_mounts: Dict[str, storage_lib.Storage] = {}
188
187
  self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
@@ -320,6 +319,7 @@ class Task:
320
319
 
321
320
  task = Task(
322
321
  config.pop('name', None),
322
+ setup=config.pop('setup', None),
323
323
  run=config.pop('run', None),
324
324
  workdir=config.pop('workdir', None),
325
325
  num_nodes=config.pop('num_nodes', None),
@@ -695,7 +695,7 @@ class Task:
695
695
  This should be called before provisioning in order to take effect.
696
696
 
697
697
  Args:
698
- storage_mounts: an optional dict of ``{mount_path: sky.Storage
698
+ storage_mounts: an optional dict of ``{mount_path: konduktor.data.Storage
699
699
  object}``, where mount_path is the path inside the remote VM(s)
700
700
  where the Storage object will be mounted on.
701
701