konduktor-nightly 0.1.0.dev20250513105010__py3-none-any.whl → 0.1.0.dev20250515104942__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
konduktor/__init__.py CHANGED
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '2b0d682b6fc8ff0d4e5ea417c4e324090f3c5f9b'
17
+ _KONDUKTOR_COMMIT_SHA = 'c0bd8e8774fab8042721b43a8cb8c35a624f8299'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250513105010'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250515104942'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -26,8 +26,7 @@ class LazyImport:
26
26
 
27
27
  We use this for pandas and networkx, as they can be time-consuming to import
28
28
  (0.1-0.2 seconds). With this class, we can avoid the unnecessary import time
29
- when the module is not used (e.g., `networkx` should not be imported for
30
- `sky status and `pandas` should not be imported for `sky exec`).
29
+ when the module is not used.
31
30
 
32
31
  We also use this for cloud adaptors, because we do not want to import the
33
32
  cloud dependencies when it is not enabled.
@@ -0,0 +1,124 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """
14
+ The local machine's public key should not be uploaded to the remote VM, because
15
+ it will cause private/public key pair mismatch when the user tries to launch new
16
+ VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
17
+ controller. (Lambda cloud is an exception, due to the limitation of the cloud
18
+ provider. See the comments in setup_lambda_authentication)
19
+ """
20
+
21
+ import functools
22
+ import os
23
+ from typing import Tuple
24
+
25
+ import filelock
26
+
27
+ from konduktor import logging
28
+ from konduktor.utils import common_utils
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+ _SSH_KEY_PATH_PREFIX = '~/.konduktor/clients/{user_hash}/ssh'
33
+
34
+ MAX_TRIALS = 64
35
+
36
+
37
def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
    """Return the per-user SSH key pair paths and the matching lock file path.

    The key directory is derived from ``_SSH_KEY_PATH_PREFIX`` and the current
    user hash, and is created with mode ``0o700`` if it does not exist yet.
    The returned paths are built from the unexpanded prefix, i.e. they are not
    passed through ``os.path.expanduser`` here.

    Returns:
        A ``(private_key_path, public_key_path, lock_path)`` tuple.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=common_utils.get_user_hash())
    os.makedirs(os.path.expanduser(key_dir), mode=0o700, exist_ok=True)

    def _in_key_dir(filename: str) -> str:
        return os.path.join(key_dir, filename)

    return (
        _in_key_dir('konduktor-key'),
        _in_key_dir('konduktor-key.pub'),
        _in_key_dir('.__internal-konduktor-key.lock'),
    )
45
+
46
+
47
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Create a fresh 2048-bit RSA key pair.

    Returns:
        A ``(public_key, private_key)`` tuple of stripped UTF-8 strings. Note
        the order: the public key comes first. The public key is serialized
        in OpenSSH format; the private key is an unencrypted PEM in the
        TraditionalOpenSSL format.
    """
    # Import cryptography lazily so that this expensive third-party import is
    # only paid for when a key actually has to be generated.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(
        public_exponent=65537, key_size=2048, backend=default_backend()
    )

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption(),
    )
    private_text = private_pem.decode('utf-8').strip()

    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH
    )
    public_text = public_openssh.decode('utf-8').strip()

    return public_text, private_text
79
+
80
+
81
+ def _save_key_pair(
82
+ private_key_path: str, public_key_path: str, private_key: str, public_key: str
83
+ ) -> None:
84
+ key_dir = os.path.dirname(private_key_path)
85
+ os.makedirs(key_dir, exist_ok=True, mode=0o700)
86
+
87
+ with open(
88
+ private_key_path,
89
+ 'w',
90
+ encoding='utf-8',
91
+ opener=functools.partial(os.open, mode=0o600),
92
+ ) as f:
93
+ f.write(private_key)
94
+
95
+ with open(
96
+ public_key_path,
97
+ 'w',
98
+ encoding='utf-8',
99
+ opener=functools.partial(os.open, mode=0o644),
100
+ ) as f:
101
+ f.write(public_key)
102
+
103
+
104
def get_or_generate_keys() -> Tuple[str, str]:
    """Return the absolute private and public key paths, generating them if needed.

    The paths from ``get_ssh_key_and_lock_path`` are user-expanded. While
    holding the key file lock, a new RSA key pair is generated and saved when
    the private key file does not exist yet.

    Returns:
        A ``(private_key_path, public_key_path)`` tuple of expanded paths.

    Raises:
        AssertionError: If the public key file is missing after the private
            key has been found or generated.
        filelock.Timeout: If the lock cannot be acquired within 10 seconds.
    """
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path) for path in get_ssh_key_and_lock_path()
    )

    # The lock file lives next to the keys; keep that directory private
    # (0o700) because it holds the private key.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True, mode=0o700)

    # Serialize key creation so concurrent callers do not each write a
    # different key pair to the same paths.
    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            public_key, private_key = _generate_rsa_key_pair()
            _save_key_pair(private_key_path, public_key_path, private_key, public_key)

    # Raised explicitly instead of using an `assert` statement so that the
    # check still runs under `python -O`; the exception type is unchanged.
    if not os.path.exists(public_key_path):
        raise AssertionError(
            'Private key found, but associated public key '
            f'{public_key_path} does not exist.'
        )
    return private_key_path, public_key_path
@@ -70,25 +70,26 @@ def _wait_for_jobset_start(namespace: str, job_name: str):
70
70
  assert jobsets is not None, (
71
71
  f'Jobset {job_name} ' f'not found in namespace {namespace}'
72
72
  )
73
- if jobsets['status']['replicatedJobsStatus'][0]['ready']:
74
- logger.info(
75
- f'task '
76
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
77
- f'{colorama.Style.RESET_ALL} ready'
78
- )
79
- break
80
- elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
81
- return
82
- elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
83
- logger.info(
84
- f'job '
85
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
86
- f'{colorama.Style.RESET_ALL} '
87
- f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
88
- )
89
- job = jobset_utils.get_job(namespace, job_name)
90
- _raise_job_error(job)
91
- return
73
+ if 'status' in jobsets:
74
+ if jobsets['status']['replicatedJobsStatus'][0]['ready']:
75
+ logger.info(
76
+ f'task '
77
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
78
+ f'{colorama.Style.RESET_ALL} ready'
79
+ )
80
+ break
81
+ elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
82
+ return
83
+ elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
84
+ logger.info(
85
+ f'job '
86
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
87
+ f'{colorama.Style.RESET_ALL} '
88
+ f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
89
+ )
90
+ job = jobset_utils.get_job(namespace, job_name)
91
+ _raise_job_error(job)
92
+ return
92
93
  if timeout != -1 and time.time() - start > timeout:
93
94
  logger.error(
94
95
  f'{colorama.Style.BRIGHT}'
@@ -1,5 +1,6 @@
1
1
  """Jobset utils: wraps CRUD operations for jobsets"""
2
2
 
3
+ import base64
3
4
  import enum
4
5
  import json
5
6
  import os
@@ -15,7 +16,7 @@ if typing.TYPE_CHECKING:
15
16
  from datetime import timedelta
16
17
 
17
18
  import konduktor
18
- from konduktor import config, constants, kube_client, logging
19
+ from konduktor import authentication, config, constants, kube_client, logging
19
20
  from konduktor.data import registry
20
21
  from konduktor.utils import (
21
22
  common_utils,
@@ -93,6 +94,10 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
93
94
  else:
94
95
  accelerator_type = None
95
96
 
97
+ assert task.resources.cpus is not None, 'Task resources cpus are required'
98
+ assert task.resources.memory is not None, 'Task resources memory are required'
99
+ assert task.resources.image_id is not None, 'Task resources image_id are required'
100
+
96
101
  # template the commands to run on the container for syncing files. At this point
97
102
  # task.stores is Dict[str, storage_utils.Storage] which is (dst, storage_obj_src)
98
103
  # first we iterate through storage_mounts and then file_mounts.
@@ -150,10 +155,35 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
150
155
  f'though specified by `tailscale.secret_name`: {err}'
151
156
  )
152
157
 
153
- assert task.resources is not None, 'Task resources are required'
154
- assert task.resources.cpus is not None, 'Task resources cpus are required'
155
- assert task.resources.memory is not None, 'Task resources memory are required'
156
- assert task.resources.image_id is not None, 'Task resources image_id are required'
158
+ enable_ssh = config.get_nested(('ssh', 'enable'), False)
159
+ secret_name = None
160
+ if enable_ssh:
161
+ private_key_path, public_key_path = authentication.get_or_generate_keys()
162
+ with (
163
+ open(private_key_path, 'rb') as private_key_file,
164
+ open(public_key_path, 'rb') as public_key_file,
165
+ ):
166
+ private_key, public_key = private_key_file.read(), public_key_file.read()
167
+ user_hash = common_utils.get_user_hash()
168
+ context = kubernetes_utils.get_current_kube_config_context_name()
169
+ namespace = kubernetes_utils.get_kube_config_context_namespace(
170
+ context_name=context
171
+ )
172
+ secret_name = f'konduktor-ssh-keys-{user_hash}'
173
+ ok, result = kubernetes_utils.set_secret(
174
+ secret_name=secret_name,
175
+ namespace=namespace,
176
+ context=context,
177
+ data={
178
+ 'PUBKEY': base64.b64encode(public_key).decode(),
179
+ 'PRIVKEY': base64.b64encode(private_key).decode(),
180
+ },
181
+ )
182
+ if not ok:
183
+ raise exceptions.CreateSecretError(
184
+ f'Failed to set k8s secret {secret_name}: \n{result}'
185
+ )
186
+
157
187
  with tempfile.NamedTemporaryFile() as temp:
158
188
  common_utils.fill_template(
159
189
  'pod.yaml.j2',
@@ -166,6 +196,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
166
196
  'master_addr': master_addr,
167
197
  'num_nodes': task.num_nodes,
168
198
  'job_name': task.name, # append timestamp and user id here?
199
+ 'setup_cmd': task.setup or '',
169
200
  'run_cmd': task.run,
170
201
  'node_hostnames': node_hostnames,
171
202
  'accelerator_type': accelerator_type,
@@ -176,6 +207,9 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
176
207
  'user': common_utils.get_cleaned_username(),
177
208
  # Tailscale credentials
178
209
  'tailscale_secret': tailscale_secret,
210
+ # SSH
211
+ 'enable_ssh': enable_ssh,
212
+ 'secret_name': secret_name,
179
213
  },
180
214
  temp.name,
181
215
  )
@@ -183,6 +217,13 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
183
217
  # merge with `~/.konduktor/config.yaml``
184
218
  kubernetes_utils.combine_pod_config_fields(temp.name, pod_config)
185
219
  pod_config = common_utils.read_yaml(temp.name)
220
+
221
+ for env_var in pod_config['kubernetes']['pod_config']['spec']['containers'][0][
222
+ 'env'
223
+ ]:
224
+ if env_var['name'] in task.envs:
225
+ env_var['value'] = task.envs.pop(env_var['name'])
226
+
186
227
  for k, v in task.envs.items():
187
228
  pod_config['kubernetes']['pod_config']['spec']['containers'][0][
188
229
  'env'
@@ -221,6 +262,7 @@ def create_jobset(
221
262
  'user': common_utils.get_cleaned_username(),
222
263
  'accelerator_type': accelerator_type,
223
264
  'num_accelerators': num_accelerators,
265
+ 'completions': task.resources.get_completions(),
224
266
  **_JOBSET_METADATA_LABELS,
225
267
  },
226
268
  temp.name,
konduktor/cli.py CHANGED
@@ -105,7 +105,7 @@ def _make_task_with_overrides(
105
105
  env: Optional[List[Tuple[str, str]]] = None,
106
106
  field_to_ignore: Optional[List[str]] = None,
107
107
  ) -> konduktor.Task:
108
- """Creates a task or a dag from an entrypoint with overrides.
108
+ """Creates a task from an entrypoint with overrides.
109
109
 
110
110
  Returns:
111
111
  konduktor.Task
@@ -271,8 +271,7 @@ _EXTRA_RESOURCES_OPTIONS = [
271
271
  type=str,
272
272
  help=(
273
273
  'Type and number of GPUs to use. Example values: '
274
- '"V100:8", "V100" (short for a count of 1), or "V100:0.5" '
275
- '(fractional counts are supported by the scheduling framework). '
274
+ '"V100:8", "V100" (short for a count of 1)'
276
275
  'If a new cluster is being launched by this command, this is the '
277
276
  'resources to provision. If an existing cluster is being reused, this'
278
277
  " is seen as the task demand, which must fit the cluster's total "
konduktor/data/aws/s3.py CHANGED
@@ -1037,8 +1037,11 @@ class S3Store(storage_utils.AbstractStore):
1037
1037
  secret_name=cls._AWS_SECRET_NAME,
1038
1038
  namespace=namespace,
1039
1039
  context=context,
1040
- secret_key=cls._AWS_CREDENTIALS_KEY,
1041
- secret_value=base64_utils.zip_base64encode(credentials_files),
1040
+ data={
1041
+ cls._AWS_CREDENTIALS_KEY: base64_utils.zip_base64encode(
1042
+ credentials_files
1043
+ )
1044
+ },
1042
1045
  )
1043
1046
  if not ok:
1044
1047
  logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
@@ -219,10 +219,10 @@ def get_gsutil_command() -> Tuple[str, str]:
219
219
  cmd_to_run = f'{alias_gen}; {gsutil_alias} cp ...'
220
220
  ```
221
221
  """
222
- gsutil_alias = 'skypilot_gsutil'
222
+ gsutil_alias = 'konduktor_gsutil'
223
223
  disable_multiprocessing_flag = '-o "GSUtil:parallel_process_count=1"'
224
224
 
225
- # Define skypilot_gsutil as a shell function instead of an alias.
225
+ # Define konduktor_gsutil as a shell function instead of an alias.
226
226
  # This function will behave just like alias, but can be called immediately
227
227
  # after its definition on the same line
228
228
  alias_gen = (
konduktor/data/gcp/gcs.py CHANGED
@@ -891,8 +891,11 @@ class GcsStore(storage_utils.AbstractStore):
891
891
  secret_name=cls._GCP_SECRET_NAME,
892
892
  namespace=namespace,
893
893
  context=context,
894
- secret_key=cls._GCP_CREDENTIALS_KEY,
895
- secret_value=base64_utils.zip_base64encode(credentials_files),
894
+ data={
895
+ cls._GCP_CREDENTIALS_KEY: base64_utils.zip_base64encode(
896
+ credentials_files
897
+ )
898
+ },
896
899
  )
897
900
  if not ok:
898
901
  logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
konduktor/data/storage.py CHANGED
@@ -271,15 +271,14 @@ class Storage(object):
271
271
  Can be a single local path, a list of local paths, or a cloud URI
272
272
  (s3://, gs://, etc.). Local paths do not need to be absolute.
273
273
  stores: Optional; Specify pre-initialized stores (S3Store, GcsStore).
274
- persistent: bool; Whether to persist across sky launches.
274
+ persistent: bool; Whether to persist across konduktor launches.
275
275
  mode: StorageMode; Specify how the storage object is manifested on
276
276
  the remote VM. Can be either MOUNT or COPY. Defaults to MOUNT.
277
- sync_on_reconstruction: bool; Whether to sync the data if the storage
278
- object is found in the global_user_state and reconstructed from
279
- there. This is set to false when the Storage object is created not
280
- for direct use, e.g. for 'sky storage delete', or the storage is
281
- being re-used, e.g., for `sky start` on a stopped cluster.
282
- _is_sky_managed: Optional[bool]; Indicates if the storage is managed
277
+ sync_on_reconstruction: bool; [defunct] Whether to sync the
278
+ data if the storage object is found in the global_user_state
279
+ and reconstructed from there. This is set to
280
+ false when the Storage object is created not for direct use
281
+ _is_sky_managed: Optional[bool]; [defunct] Indicates if the storage is managed
283
282
  by Sky. Without this argument, the controller's behavior differs
284
283
  from the local machine. For example, if a bucket does not exist:
285
284
  Local Machine (is_sky_managed=True) →
konduktor/execution.py CHANGED
@@ -149,10 +149,10 @@ def maybe_translate_local_file_mounts_and_sync_up(
149
149
  msg = 'workdir'
150
150
  if msg:
151
151
  logger.info(
152
- ux_utils.starting_message(f'Translating {msg} to ' 'SkyPilot Storage...')
152
+ ux_utils.starting_message(f'Translating {msg} to ' 'cloud Storage...')
153
153
  )
154
154
  rich_utils.force_update_status(
155
- ux_utils.spinner_message(f'Translating {msg} to SkyPilot Storage...')
155
+ ux_utils.spinner_message(f'Translating {msg} to cloud Storage...')
156
156
  )
157
157
 
158
158
  # Get the bucket name for the workdir and file mounts,
konduktor/kube_client.py CHANGED
@@ -63,8 +63,6 @@ def _load_config(context: Optional[str] = None):
63
63
  err_str = (
64
64
  f'Failed to load Kubernetes configuration for {context!r}. '
65
65
  'Kubeconfig does not contain any valid context(s).\n'
66
- ' If you were running a local Kubernetes '
67
- 'cluster, run `sky local up` to start the cluster.'
68
66
  )
69
67
  else:
70
68
  err_str = (
@@ -72,7 +70,6 @@ def _load_config(context: Optional[str] = None):
72
70
  'Please check if your kubeconfig file exists at '
73
71
  f'~/.kube/config and is valid.'
74
72
  )
75
- err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
76
73
  with ux_utils.print_exception_no_traceback():
77
74
  raise ValueError(err_str) from None
78
75
 
konduktor/resource.py CHANGED
@@ -49,6 +49,7 @@ class Resources:
49
49
  image_id: Union[str, None] = None,
50
50
  disk_size: Optional[int] = None,
51
51
  labels: Optional[Dict[str, str]] = None,
52
+ job_config: Optional[Dict[str, Union[int, str]]] = None,
52
53
  # Internal use only.
53
54
  # pylint: disable=invalid-name
54
55
  _cluster_config_overrides: Optional[Dict[str, Any]] = None,
@@ -91,6 +92,7 @@ class Resources:
91
92
  instance tags. On GCP, labels map to instance labels. On
92
93
  Kubernetes, labels map to pod labels. On other clouds, labels are
93
94
  not supported and will be ignored.
95
+ job_config: the configuration of the job spec
94
96
  Raises:
95
97
  ValueError: if some attributes are invalid.
96
98
  exceptions.NoCloudAccessError: if no public cloud is enabled.
@@ -122,6 +124,7 @@ class Resources:
122
124
  self._set_cpus(cpus)
123
125
  self._set_memory(memory)
124
126
  self._set_accelerators(accelerators)
127
+ self.job_config = job_config
125
128
 
126
129
  # TODO: move these out of init to prevent repeated calls.
127
130
  self._try_validate_cpus_mem()
@@ -382,6 +385,11 @@ class Resources:
382
385
  accel_str = f'{accel_name}:{accel_count}'
383
386
  return accel_str
384
387
 
388
def get_completions(self) -> Optional[int]:
    """Return the ``completions`` entry of ``job_config`` as an int.

    Returns:
        ``int(job_config['completions'])`` when ``job_config`` is set and
        holds a truthy ``completions`` value; otherwise ``None``.
    """
    # Use `.get` so that a job_config without a `completions` key yields None
    # instead of raising KeyError.
    completions = (self.job_config or {}).get('completions')
    if not completions:
        return None
    return int(completions)
392
+
385
393
  def copy(self, **override) -> 'Resources':
386
394
  """Returns a copy of the given Resources."""
387
395
  resources = Resources(
@@ -392,6 +400,7 @@ class Resources:
392
400
  disk_size=override.pop('disk_size', self.disk_size),
393
401
  image_id=override.pop('image_id', self.image_id),
394
402
  labels=override.pop('labels', self.labels),
403
+ job_config=override.pop('job_config', self.job_config),
395
404
  )
396
405
  assert len(override) == 0
397
406
  return resources
@@ -404,6 +413,13 @@ class Resources:
404
413
  config, schemas.get_resources_schema(), 'Invalid resources YAML: '
405
414
  )
406
415
 
416
+ if config.get('job_config', None):
417
+ common_utils.validate_schema(
418
+ config['job_config'],
419
+ schemas.get_job_schema(),
420
+ 'Invalid job config YAML',
421
+ )
422
+
407
423
  def _override_resources(
408
424
  base_resource_config: Dict[str, Any], override_configs: List[Dict[str, Any]]
409
425
  ) -> List[Resources]:
@@ -446,6 +462,7 @@ class Resources:
446
462
  resources_fields['disk_size'] = config.pop('disk_size', None)
447
463
  resources_fields['image_id'] = config.pop('image_id', None)
448
464
  resources_fields['labels'] = config.pop('labels', None)
465
+ resources_fields['job_config'] = config.pop('job_config', None)
449
466
 
450
467
  if resources_fields['cpus'] is not None:
451
468
  resources_fields['cpus'] = str(resources_fields['cpus'])
@@ -475,4 +492,5 @@ class Resources:
475
492
  add_if_not_none('disk_size', self.disk_size)
476
493
  add_if_not_none('image_id', self.image_id)
477
494
  add_if_not_none('labels', self.labels)
495
+ add_if_not_none('job_config', self.job_config)
478
496
  return config
konduktor/task.py CHANGED
@@ -181,8 +181,7 @@ class Task:
181
181
  """
182
182
  assert name is not None, 'Task name is required'
183
183
  self.name = name
184
- if setup is not None:
185
- raise ValueError('`setup` is being deprecated and not supported')
184
+ self.setup = setup
186
185
  self.run = run
187
186
  self.storage_mounts: Dict[str, storage_lib.Storage] = {}
188
187
  self.storage_plans: Dict[storage_lib.Storage, storage_lib.StoreType] = {}
@@ -320,6 +319,7 @@ class Task:
320
319
 
321
320
  task = Task(
322
321
  config.pop('name', None),
322
+ setup=config.pop('setup', None),
323
323
  run=config.pop('run', None),
324
324
  workdir=config.pop('workdir', None),
325
325
  num_nodes=config.pop('num_nodes', None),
@@ -695,7 +695,7 @@ class Task:
695
695
  This should be called before provisioning in order to take effect.
696
696
 
697
697
  Args:
698
- storage_mounts: an optional dict of ``{mount_path: sky.Storage
698
+ storage_mounts: an optional dict of ``{mount_path: konduktor.data.Storage
699
699
  object}``, where mount_path is the path inside the remote VM(s)
700
700
  where the Storage object will be mounted on.
701
701
 
@@ -1,12 +1,12 @@
1
1
  kubernetes:
2
2
  pod_config:
3
3
  metadata:
4
- {% if accelerator_type %}
5
4
  labels:
6
5
  parent: trainy
7
- trainy.ai/accelerator: {{ accelerator_type }}
8
6
  trainy.ai/username: {{ user }}
9
- {% endif %}
7
+ {% if accelerator_type %}
8
+ trainy.ai/accelerator: {{ accelerator_type }}
9
+ {% endif %}
10
10
  spec:
11
11
  restartPolicy: "Never"
12
12
  # trigger this on GPU request
@@ -15,9 +15,39 @@ kubernetes:
15
15
  - key: "nvidia.com/gpu"
16
16
  operator: "Exists"
17
17
  {% endif %}
18
+ initContainers:
19
+ - name: setup-synchronizer
20
+ image: "alpine:3.19"
21
+ restartPolicy: Always
22
+ command: ["/bin/sh", "-c"]
23
+ args:
24
+ - |
25
+ apk add --no-cache socat
26
+ wget https://raw.githubusercontent.com/asaiacai/dumb_barrier/refs/heads/main/dumb_barrier.sh
27
+ sh -x dumb_barrier.sh
28
+ volumeMounts:
29
+ - name: sync
30
+ mountPath: /tmp/konduktor
31
+ env:
32
+ - name: MASTER_ADDR
33
+ value: "{{ master_addr }}"
34
+ - name: RANK
35
+ valueFrom:
36
+ fieldRef:
37
+ fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
38
+ - name: WORLD_SIZE
39
+ value: "{{ num_nodes }}"
40
+ - name: MASTER_PORT
41
+ value: "11111"
42
+ - name: GO_PORT
43
+ value: "11112"
18
44
  containers:
19
45
  # TODO(asaiacai): should decide here whether we add the fabric interfaces/containers init etc.
20
46
  - name: konduktor-container
47
+ {% if enable_ssh %}
48
+ ports:
49
+ - containerPort: 2222
50
+ {% endif %}
21
51
  image: {{ image_id }}
22
52
  # this is set during jobset definition since we need to know the jobset
23
53
  # name and number of nodes to set all the environment variables correctly here
@@ -56,6 +86,18 @@ kubernetes:
56
86
  fieldRef:
57
87
  fieldPath: metadata.uid
58
88
  {% endif %}
89
+ {% if enable_ssh %}
90
+ - name: KONDUKTOR_SSHPUB
91
+ valueFrom:
92
+ secretKeyRef:
93
+ name: {{ secret_name }}
94
+ key: PUBKEY
95
+ - name: KONDUKTOR_SSHPRIV
96
+ valueFrom:
97
+ secretKeyRef:
98
+ name: {{ secret_name }}
99
+ key: PRIVKEY
100
+ {% endif %}
59
101
  # these are for compatibility with skypilot
60
102
  - name: SKYPILOT_NODE_IPS
61
103
  value: "{{ node_hostnames }}"
@@ -70,6 +112,8 @@ kubernetes:
70
112
  volumeMounts:
71
113
  - name: shared-memory
72
114
  mountPath: /dev/shm
115
+ - name: sync
116
+ mountPath: /tmp/konduktor
73
117
  {% for secret_type, secret_name in mount_secrets.items() %}
74
118
  - name: {{ secret_type }}-secret
75
119
  mountPath: /run/konduktor/{{ secret_type }}-secret
@@ -89,22 +133,22 @@ kubernetes:
89
133
 
90
134
 
91
135
  PACKAGES="";
92
- {% if 'rsync' in run_cmd %}
136
+ {% if 'rsync' in run_cmd or 'rsync' in setup_cmd %}
93
137
  PACKAGES="$PACKAGES rsync";
94
138
  {% endif %}
95
- {% if 'curl' in run_cmd or tailscale_secret %}
139
+ {% if 'curl' in run_cmd or 'curl' in setup_cmd or tailscale_secret %}
96
140
  PACKAGES="$PACKAGES curl";
97
141
  {% endif %}
98
142
  {% if 'gs' in mount_secrets or 's3' in mount_secrets %}
99
143
  PACKAGES="$PACKAGES unzip wget";
100
144
  {% endif %}
101
- {% if 'git' in run_cmd %}
145
+ {% if 'git' in run_cmd or 'git' in setup_cmd %}
102
146
  PACKAGES="$PACKAGES git";
103
147
  {% endif %}
104
148
 
105
149
  if [ ! -z "${PACKAGES}" ]; then
106
150
  # Run apt update, install missing packages
107
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
151
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update 2>&1 | tee -a ~/.konduktor/tmp/apt-update.log 2>&1 || \
108
152
  $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
109
153
  fi
110
154
 
@@ -125,22 +169,114 @@ kubernetes:
125
169
  done;
126
170
  if [ ! -z "$INSTALL_FIRST" ]; then
127
171
  $(prefix_cmd) echo "Installing core packages: $INSTALL_FIRST";
128
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST >> ~/.konduktor/tmp/apt-install.log;
172
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
129
173
  fi;
130
174
 
131
175
  if [ ! -z "$MISSING_PACKAGES" ]; then
132
176
  $(prefix_cmd) echo "Installing missing packages: $MISSING_PACKAGES";
133
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES >> ~/.konduktor/tmp/apt-install.log;
177
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES 2>&1 | tee -a ~/.konduktor/tmp/apt-install.log;
134
178
  fi;
135
179
  end_epoch=$(date +%s);
136
180
 
181
+ {% if enable_ssh %}
182
+
183
# Install (if missing), configure and start an OpenSSH server on port 2222,
# authorizing the key pair passed in through the KONDUKTOR_SSHPUB and
# KONDUKTOR_SSHPRIV environment variables.
function InstallSSH {
  export DEBIAN_FRONTEND=noninteractive
  export TZ=Etc/UTC
  # NOTE(review): this probes the `sshd` service while the server is started
  # below as `ssh` -- confirm which service name the base images register.
  if service sshd status > /dev/null 2>&1; then
    $(prefix_cmd) echo "OpenSSH server is already started."
    return
  fi
  # Check if OpenSSH server is already installed
  if ! command -v sshd &> /dev/null; then
    $(prefix_cmd) echo "OpenSSH server is not installed. Installing..."

    # Redirect stdout to the log first, then point stderr at it, so that
    # both streams end up in the log file.
    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt update >> ~/.konduktor/tmp/apt-install.log 2>&1;
    DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt install -y openssh-server >> ~/.konduktor/tmp/apt-install.log;

    $(prefix_cmd) echo "OpenSSH server installation complete."
  else
    $(prefix_cmd) echo "OpenSSH server is already installed."
  fi

  # Enable root login in the SSH configuration, whether the directive is
  # currently commented out or already set.
  $(prefix_cmd) sed -i '/^#PermitRootLogin/c\PermitRootLogin yes' /etc/ssh/sshd_config
  $(prefix_cmd) sed -i '/^PermitRootLogin/c\PermitRootLogin yes' /etc/ssh/sshd_config
  $(prefix_cmd) echo "Root login is enabled."

  # Create the .ssh directory and authorized_keys file if they don't exist
  if [ ! -d "$HOME/.ssh" ]; then
    $(prefix_cmd) mkdir -p "$HOME/.ssh"
    $(prefix_cmd) chmod 0700 "$HOME/.ssh"
    $(prefix_cmd) echo "Directory $HOME/.ssh created."
  fi
  if [ ! -f "$HOME/.ssh/authorized_keys" ]; then
    $(prefix_cmd) touch "$HOME/.ssh/authorized_keys"
    $(prefix_cmd) chmod 0600 "$HOME/.ssh/authorized_keys"
    $(prefix_cmd) echo "File $HOME/.ssh/authorized_keys created."
  fi
  # Append the public key unless it is already present. -F matches the key
  # as a fixed string rather than interpreting it as a regular expression.
  if ! grep -qF "${KONDUKTOR_SSHPUB}" "$HOME/.ssh/authorized_keys"; then
    $(prefix_cmd) echo "${KONDUKTOR_SSHPUB}" >> "$HOME/.ssh/authorized_keys"
    $(prefix_cmd) echo "Public key added."
  fi
  if [ ! -f "$HOME/.ssh/konduktor-key" ]; then
    # Write the private key to its own file, readable only by the owner
    $(prefix_cmd) touch "$HOME/.ssh/konduktor-key"
    $(prefix_cmd) chmod 0600 "$HOME/.ssh/konduktor-key"
    $(prefix_cmd) echo "${KONDUKTOR_SSHPRIV}" >> "$HOME/.ssh/konduktor-key"
    $(prefix_cmd) echo "private key added."
  fi
  if [ ! -f "$HOME/.ssh/config" ]; then
    # Create a client config that disables strict host key checking
    $(prefix_cmd) touch "$HOME/.ssh/config"
    $(prefix_cmd) chmod 0600 "$HOME/.ssh/config"
    $(prefix_cmd) printf '\nHost *\n StrictHostKeyChecking no\n' >> "$HOME/.ssh/config"
    $(prefix_cmd) echo "ssh config set"
  fi

  # turn off PAM to fix sshd login issue
  $(prefix_cmd) sed -i 's/UsePAM yes/UsePAM no/' /etc/ssh/sshd_config

  # set default port to 2222
  $(prefix_cmd) sed -i 's/#Port 22/Port 2222/' /etc/ssh/sshd_config

  # Dump the current environment to /etc/environment as NAME="value" lines
  # and have root's bash shells source it.
  echo "Exposing ENV variables"
  env -0 | awk -v RS='\0' '
  {
    gsub(/\\/,"\\\\"); # escape existing backslashes first
    gsub(/"/,"\\\""); # escape any double quotes
    gsub(/\n/,"\\n"); # turn real newlines into the two characters \n
    sub(/=/,"=\""); # open the value-quoting
    print $0 "\""; # close the quote and add a newline record separator
  }
  ' > /etc/environment
  echo "set -a; source /etc/environment; set +a;" >> /root/.bashrc

  # -p: do not fail when /run/sshd already exists
  $(prefix_cmd) mkdir -p /run/sshd
  $(prefix_cmd) chmod 0755 /run/sshd

  $(prefix_cmd) service ssh start
  $(prefix_cmd) echo "sshd service started"
}
264
+
265
+ InstallSSH
266
+ {% endif %}
137
267
  {% if tailscale_secret %}
138
- if ! command -v tailscale >/dev/null 2>&1; then
139
- export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
140
- $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh >> ~/.konduktor/tmp/tailscale-install.log
141
- $(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &
142
- $(prefix_cmd) tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME} >/dev/null 2>&1
143
- fi
268
# Install tailscale, start the daemon with userspace networking and keep
# retrying `tailscale up` until `tailscale status` succeeds.
function InstallTailscale {
  # Nothing to do when the tailscale CLI is already on PATH.
  if command -v tailscale >/dev/null 2>&1; then
    return
  fi

  # Hostname is POD_NAME with its last "-"-separated suffix removed.
  export TS_HOSTNAME=$(echo "$POD_NAME" | sed 's/-[^-]*$//')
  $(prefix_cmd) curl -fsSL https://tailscale.com/install.sh | DEBIAN_FRONTEND=noninteractive $(prefix_cmd) sh > ~/.konduktor/tmp/tailscale-install.log 2>&1
  $(prefix_cmd) tailscaled --tun=userspace-networking >/dev/null 2>&1 &

  # Each `tailscale up` attempt is capped at 5 seconds, with a 1 second
  # pause between attempts.
  until tailscale status >/dev/null 2>&1; do
    $(prefix_cmd) timeout 5 tailscale up --auth-key=${TS_AUTHKEY} --ssh --hostname=${TS_HOSTNAME}
    sleep 1
  done
}
279
+ InstallTailscale &
144
280
  {% endif %}
145
281
  end_epoch=$(date +%s);
146
282
 
@@ -182,14 +318,42 @@ kubernetes:
182
318
  end_setup_time=$((end_epoch - start_setup));
183
319
  ulimit -Sc 0 && ulimit -Hc 0
184
320
  $(prefix_cmd) echo "===== KONDUKTOR: Initialization took $end_setup_time seconds ====="
185
- # run task
321
+ set +eo pipefail
186
322
  $(prefix_cmd) cd {{ remote_workdir }}
323
+ {% if setup_cmd %}
324
+ # setup task
325
+ $(prefix_cmd) echo "===== KONDUKTOR: Running setup ======="
326
+ {{ setup_cmd | indent( width=14 ) }}
327
+ {% endif %}
328
+
329
+ # synchronize workers before executing `run`
330
+ set -e
331
+ touch "/tmp/konduktor/SETUP"
332
+ # TODO(asaiacai): should we make this value tuneable for users?
333
+ TIMEOUT=3600
334
+ start_sync=$(date +%s);
335
+ DEADLINE=$(( $(date +%s) + TIMEOUT ))
336
+
337
+ echo "[KONDUKTOR: main] Waiting for workers to synchronize"
338
+ while [ ! -f "/tmp/konduktor/READY" ]; do
339
+ if [ "$(date +%s)" -ge "$DEADLINE" ]; then
340
+ echo "[KONDUKTOR: main] ERROR: Timed out after ${TIMEOUT} seconds of waiting for worker synchronization"
341
+ exit 1
342
+ fi
343
+ sleep 0.5
344
+ done
345
+ echo "[KONDUKTOR: main] All workers have joined"
346
+ end_sync=$(date +%s);
347
+ echo "[KONDUKTOR: main] Synchronization took $((end_sync - start_sync)) seconds"
187
348
  set +eo pipefail
349
+ # run task
350
+ $(prefix_cmd) cd {{ remote_workdir }}
188
351
  $(prefix_cmd) echo "===== KONDUKTOR: Running task ====="
189
352
  start_epoch=$(date +%s);
190
353
  {{ run_cmd | indent( width=14 ) }}
191
354
  end_epoch=$(date +%s);
192
355
  exit_code=$?
356
+ set +ex
193
357
  $(prefix_cmd) echo "===== KONDUKTOR: Running task took $((end_epoch - start_epoch)) seconds and finished with exit code: $exit_code ====="
194
358
  exit $exit_code
195
359
  resources:
@@ -216,6 +380,8 @@ kubernetes:
216
380
  emptyDir:
217
381
  medium: "Memory"
218
382
  sizeLimit: 4Gi
383
+ - name: sync
384
+ emptyDir: {}
219
385
  {% for secret_type, secret_name in mount_secrets.items() %}
220
386
  - name: {{ secret_type }}-secret
221
387
  secret:
@@ -43,7 +43,7 @@ def is_safe_exception(exc: Exception) -> bool:
43
43
  return True
44
44
 
45
45
  # Konduktor's own exceptions
46
- if module.startswith('sky.'):
46
+ if module.startswith('konduktor.'):
47
47
  return True
48
48
 
49
49
  return False
@@ -130,6 +130,10 @@ class CommandError(Exception):
130
130
  pass
131
131
 
132
132
 
133
+ class CreateSecretError(Exception):
134
+ pass
135
+
136
+
133
137
  class MissingSecretError(Exception):
134
138
  pass
135
139
 
@@ -54,7 +54,7 @@ NO_ACCELERATOR_HELP_MESSAGE = (
54
54
  '(e.g. `nvidia.com/gpu` are setup correctly. '
55
55
  )
56
56
 
57
- _K8S_CLIENT_LOCK_PATH = '~/.konduktor/k8s_client.lock'
57
+ _K8S_CLIENT_LOCK_PATH = os.path.expanduser('~/.konduktor/k8s_client.lock')
58
58
  _K8s_CLIENT_LOCK = filelock.FileLock(_K8S_CLIENT_LOCK_PATH)
59
59
 
60
60
  logger = logging.get_logger(__name__)
@@ -578,11 +578,14 @@ def set_secret(
578
578
  secret_name: str,
579
579
  namespace: str,
580
580
  context: Optional[str],
581
- secret_key: str,
582
- secret_value: str,
581
+ data: Dict[str, str],
583
582
  ) -> Tuple[bool, Optional[str]]:
584
583
  """
585
584
  Create/update a secret in a namespace. Values are encoded to base64.
585
+ Each value in `data` must already be base64 encoded, i.e.
586
+ ```
587
+ base64.b64encode(secret).decode()
588
+ ```
586
589
  """
587
590
  with _K8s_CLIENT_LOCK:
588
591
  secret_exists, response = check_secret_exists(
@@ -598,7 +601,7 @@ def set_secret(
598
601
  secret = kubernetes.client.V1Secret(
599
602
  metadata=kubernetes.client.V1ObjectMeta(**secret_metadata),
600
603
  type='Opaque',
601
- data={secret_key: secret_value},
604
+ data=data,
602
605
  )
603
606
 
604
607
  try:
@@ -100,8 +100,6 @@ def _handle_io_stream(io_stream, out_stream, args: _ProcessingArgs):
100
100
  start_streaming_flag = True
101
101
  if args.end_streaming_at is not None and args.end_streaming_at in line:
102
102
  # Keep executing the loop, only stop streaming.
103
- # E.g., this is used for `sky bench` to hide the
104
- # redundant messages of `sky launch` while
105
103
  # saving them in log files.
106
104
  end_streaming_flag = True
107
105
  if args.stream_logs and start_streaming_flag and not end_streaming_flag:
@@ -67,7 +67,8 @@ def tail_loki_logs_ws(
67
67
  logger.debug(f'Loki URL: {loki_url}')
68
68
  params = {
69
69
  'query': urllib.parse.quote(
70
- f'{{k8s_job_name="{job_name}-workers-0"}} '
70
+ r'{' + f'k8s_job_name="{job_name}-workers-0",'
71
+ r' k8s_container_name="konduktor-container"} '
71
72
  f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
72
73
  ),
73
74
  'limit': num_logs,
@@ -103,7 +103,7 @@ class RichSafeStreamHandler(logging.StreamHandler):
103
103
 
104
104
 
105
105
  def force_update_status(msg: str):
106
- """Update the status message even if sky_logging.is_silent() is true."""
106
+ """Update the status message even if konduktor_logging.is_silent() is true."""
107
107
  if threading.current_thread() is threading.main_thread() and _status is not None:
108
108
  _status.update(msg)
109
109
 
@@ -87,6 +87,7 @@ def _get_single_resources_schema():
87
87
  '_cluster_config_overrides': {
88
88
  'type': 'object',
89
89
  },
90
+ 'job_config': {'type': 'object'},
90
91
  },
91
92
  }
92
93
 
@@ -153,8 +154,6 @@ def get_resources_schema():
153
154
  'items': multi_resources_schema,
154
155
  },
155
156
  },
156
- # Avoid job_recovery and spot_recovery being present at the same time.
157
- **_check_not_both_fields_present('job_recovery', 'spot_recovery'),
158
157
  }
159
158
 
160
159
 
@@ -337,84 +336,6 @@ def get_cluster_schema():
337
336
  }
338
337
 
339
338
 
340
- _NETWORK_CONFIG_SCHEMA = {
341
- 'vpc_name': {
342
- 'oneOf': [
343
- {
344
- 'type': 'string',
345
- },
346
- {
347
- 'type': 'null',
348
- },
349
- ],
350
- },
351
- 'use_internal_ips': {
352
- 'type': 'boolean',
353
- },
354
- 'ssh_proxy_command': {
355
- 'oneOf': [
356
- {
357
- 'type': 'string',
358
- },
359
- {
360
- 'type': 'null',
361
- },
362
- {
363
- 'type': 'object',
364
- 'required': [],
365
- 'additionalProperties': {
366
- 'anyOf': [
367
- {'type': 'string'},
368
- {'type': 'null'},
369
- ]
370
- },
371
- },
372
- ]
373
- },
374
- }
375
-
376
- _LABELS_SCHEMA = {
377
- # Deprecated: 'instance_tags' is replaced by 'labels'. Keeping for backward
378
- # compatibility. Will be removed after 0.8.0.
379
- 'instance_tags': {
380
- 'type': 'object',
381
- 'required': [],
382
- 'additionalProperties': {
383
- 'type': 'string',
384
- },
385
- },
386
- 'labels': {
387
- 'type': 'object',
388
- 'required': [],
389
- 'additionalProperties': {
390
- 'type': 'string',
391
- },
392
- },
393
- }
394
-
395
- _PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY = {
396
- 'oneOf': [
397
- {'type': 'string'},
398
- {
399
- # A list of single-element dict to pretain the
400
- # order.
401
- # Example:
402
- # property_name:
403
- # - my-cluster1-*: my-property-1
404
- # - my-cluster2-*: my-property-2
405
- # - "*"": my-property-3
406
- 'type': 'array',
407
- 'items': {
408
- 'type': 'object',
409
- 'additionalProperties': {'type': 'string'},
410
- 'maxProperties': 1,
411
- 'minProperties': 1,
412
- },
413
- },
414
- ]
415
- }
416
-
417
-
418
339
  class RemoteIdentityOptions(enum.Enum):
419
340
  """Enum for remote identity types.
420
341
 
@@ -454,9 +375,8 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
454
375
 
455
376
  def get_storage_schema():
456
377
  # pylint: disable=import-outside-toplevel
457
- from knoduktor.registry import registry
458
-
459
378
  from konduktor.data import storage
379
+ from konduktor.registry import registry
460
380
 
461
381
  return {
462
382
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
@@ -496,6 +416,21 @@ def get_storage_schema():
496
416
  }
497
417
 
498
418
 
419
+ def get_job_schema():
420
+ """Schema for a job spec, which is defined under resources."""
421
+ return {
422
+ '$schema': 'https://json-schema.org/draft/2020-12/schema',
423
+ 'type': 'object',
424
+ 'required': [],
425
+ 'additionalProperties': False,
426
+ 'properties': {
427
+ 'completions': {
428
+ 'type': 'number',
429
+ },
430
+ },
431
+ }
432
+
433
+
499
434
  def get_config_schema():
500
435
  # pylint: disable=import-outside-toplevel
501
436
  from konduktor.data import registry
@@ -574,6 +509,17 @@ def get_config_schema():
574
509
  },
575
510
  }
576
511
 
512
+ ssh_configs = {
513
+ 'type': 'object',
514
+ 'required': [],
515
+ 'additionalProperties': False,
516
+ 'properties': {
517
+ 'enable': {
518
+ 'type': 'boolean',
519
+ },
520
+ },
521
+ }
522
+
577
523
  for cloud, config in cloud_configs.items():
578
524
  if cloud == 'kubernetes':
579
525
  config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES)
@@ -589,6 +535,7 @@ def get_config_schema():
589
535
  'nvidia_gpus': gpu_configs,
590
536
  'allowed_clouds': allowed_clouds,
591
537
  'tailscale': tailscale_configs,
538
+ 'ssh': ssh_configs,
592
539
  **cloud_configs,
593
540
  },
594
541
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250513105010
3
+ Version: 0.1.0.dev20250515104942
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -1,14 +1,15 @@
1
- konduktor/__init__.py,sha256=EeKQlgZ2Urm7c4G5OTI2uDLP-6VyEk3QkFrcEbzIkI0,1540
1
+ konduktor/__init__.py,sha256=Dz34neDq8Q0MFGTtlBPBW9OnytuxHIBA6KHubrKBi2g,1540
2
2
  konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  konduktor/adaptors/aws.py,sha256=s47Ra-GaqCQibzVfmD0pmwEWHif1EGO5opMbwkLxTCU,8244
4
- konduktor/adaptors/common.py,sha256=uTdpKvgBSwYMmynx9wR5kiZQyTrdaw9ZI4KH6Z2E5Hw,4296
4
+ konduktor/adaptors/common.py,sha256=ZIqzjx77PIHUwpjfAQ1uX8B2aX78YMuGj4Bppd-MdyM,4183
5
5
  konduktor/adaptors/gcp.py,sha256=ierTF4z7vwpJ9BsC7LSiwv4uLcjGXscwZOwQrddr2vM,4102
6
+ konduktor/authentication.py,sha256=_mVy3eqoKohicHostFiGwG1-2ybxP-l7ouofQ0LRlCY,4570
6
7
  konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
7
8
  konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
8
- konduktor/backends/jobset.py,sha256=veptYGXtk-ugWxBsBV5SnqI4rGKOlGfm_N3wApvNhSQ,8326
9
- konduktor/backends/jobset_utils.py,sha256=B0N0sx-pWF9_CDeuSXAU4nm3ZIwroyVcq6aUAlNZZRs,18376
9
+ konduktor/backends/jobset.py,sha256=UdhwAuZODLMbLY51Y2zOBsh6wg4Pb84oHVvUKzx3Z2w,8434
10
+ konduktor/backends/jobset_utils.py,sha256=4vMYOhTENfBL9khzFuj69-Vy4g0sBkUpXX-1bfPnVys,20054
10
11
  konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
11
- konduktor/cli.py,sha256=Ii9-2mrc-1f2ksLasA-xRb-JnEi_9ZeCXZ3lJ1GG8H8,23515
12
+ konduktor/cli.py,sha256=Fl1dwNB5T-kDQAlAoOJetzl6RYt9FYUlowKjbNhVjkQ,23412
12
13
  konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
13
14
  konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
14
15
  konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,27 +51,27 @@ konduktor/dashboard/frontend/server.js,sha256=jcp6_Ww9YJD3uKY07jR3KMlAM6n1QZdxZn
50
51
  konduktor/dashboard/frontend/tailwind.config.js,sha256=fCnc48wvioIDOe5ldQ_6RE7F76cP7aU7pDrxBPJx-Fk,366
51
52
  konduktor/data/__init__.py,sha256=KMR2i3E9YcIpiIuCxtRdS7BQ1w2vUAbbve7agziJrLo,213
52
53
  konduktor/data/aws/__init__.py,sha256=_6zWfNNAK1QGgyKqg_yPYWcXlnffchyvIMErYa6tw_U,331
53
- konduktor/data/aws/s3.py,sha256=2hvbgZ9NuwXY88blxfdjSbONSXcyWF0CtheDZkMYorQ,48296
54
+ konduktor/data/aws/s3.py,sha256=T4FnCxilNp35bsgmE7j5O3j15FVbgWRdUH8YFXCiwSw,48335
54
55
  konduktor/data/constants.py,sha256=yXVEoTI2we1xOjVSU-bjRCQCLpVvpEvJ0GedXvSwEfw,127
55
- konduktor/data/data_utils.py,sha256=yrnu8_cY63TXqfWfFG3yqY2w_tE9UQK9jIQAFQCDVg0,9668
56
+ konduktor/data/data_utils.py,sha256=IG1jgb_La997wi90xCvxYYsHQRlmm8Aooq04ZSf8EDI,9670
56
57
  konduktor/data/gcp/__init__.py,sha256=rlQxACBC_Vu36mdgPyJgUy4mGc_6Nt_a96JAuaPz2pQ,489
57
58
  konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
58
- konduktor/data/gcp/gcs.py,sha256=kDbUzf8ALYzsw_G3sBRn_enQ8fjI-UKV0jeWuFZiULA,42018
59
+ konduktor/data/gcp/gcs.py,sha256=nqhCvQuGpHFPoxT5SKgxL25KtZuSg377Nh1bICiQwlc,42057
59
60
  konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
60
61
  konduktor/data/registry.py,sha256=CUbMsN_Q17Pf4wRHkqZrycErEjTP7cLEdgcfwVGcEpc,696
61
- konduktor/data/storage.py,sha256=SDKRWDd7PCT9ytuz4cH0CejZj5QmWG_EZhUMVoTzWsc,35308
62
+ konduktor/data/storage.py,sha256=o2So-bY9glvgbGdoN7AQNYmNnvGf1AUDPpImtadRL90,35213
62
63
  konduktor/data/storage_utils.py,sha256=n4GivkN0KMqmyOTDznF0Z-hzsJvm7KCEh5i5HgFAT-4,20806
63
- konduktor/execution.py,sha256=UaHUdBmDaIYgiAXkRKJQOHniYPVIR4sr4yUbIqpgMrQ,18401
64
- konduktor/kube_client.py,sha256=aqwjDfNSneB5NOxV6CtqhkBeNl0UQNUt730R3ujG9Ow,6156
64
+ konduktor/execution.py,sha256=NCl2bgo5p1ZZl8HLaXT-juAe9PXr-iCJv0md2sT7A20,18395
65
+ konduktor/kube_client.py,sha256=lC-U_1hLRG3mDN8tBxYc4VZ3BS5BzKm8hlt-lE3505A,5938
65
66
  konduktor/logging.py,sha256=mBCoCTNhDEkUxd4tsse4mw-aVzSGohhXYf16ViR0ch4,2722
66
67
  konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4bo3lrigRmhf8NXBHE,1730
67
68
  konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
68
69
  konduktor/manifests/dmesg_daemonset.yaml,sha256=pSWt7YOeTYjS0l0iki1fvHOs7MhY-sH-RQfVW6JJyno,1391
69
70
  konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw1vcq9_3gwHCu75Bi0A,3939
70
- konduktor/resource.py,sha256=68z8gC8Ivqktwv0R6ylMn9ZNocgkcRT0yIRGGKOdwcM,18491
71
- konduktor/task.py,sha256=Vu1TzYtLvSBz-HyHY2gsM2cMcUhMNQu44L3CWmYRXKE,35232
71
+ konduktor/resource.py,sha256=w2PdIrmQaJWA-GLSmVBcg4lxwuxvPulz35_YSKa5o24,19254
72
+ konduktor/task.py,sha256=ofwd8WIhfD6C3ThLcv6X3GUzQHyZ6ddjUagE-umF4K0,35207
72
73
  konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
73
- konduktor/templates/pod.yaml.j2,sha256=s3eECjLevUWR-zvyeI8WjQWxQYJh_AMk1tdQVGNXpEM,9835
74
+ konduktor/templates/pod.yaml.j2,sha256=xg0BiATrBtL7A5TkU_ndh26WZqnnkf-jtXadJ1BILHc,17343
74
75
  konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
75
76
  konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
76
77
  konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -80,18 +81,18 @@ konduktor/utils/base64_utils.py,sha256=mF-Tw98mFRG70YE4w6s9feuQSCYZHOb8YatBZwMug
80
81
  konduktor/utils/common_utils.py,sha256=F5x7k4AdBB44u8PYRkaugORnZKnK3JLqGn1jHOKgUYo,14960
81
82
  konduktor/utils/constants.py,sha256=1DneiTR21lvKUcWdBGwC4I4fD4uPjbjLUilEnJS7rzA,216
82
83
  konduktor/utils/env_options.py,sha256=T41Slzf4Mzl-n45CGXXqdy2fCrYhPNZQ7RP5vmnN4xc,2258
83
- konduktor/utils/exceptions.py,sha256=IHyaP5ERZpPvWZeKWV3MVTyKsxo2Fq-13nhI0PRNQzk,6629
84
+ konduktor/utils/exceptions.py,sha256=5IFnN5bIUSBJv4KRRrCepk5jyY9EG5vWWQqbjCmP3NU,6682
84
85
  konduktor/utils/kubernetes_enums.py,sha256=SabUueF6Bpzbpa57gyH5VB65xla2N9l8CZmAeYTfGmM,176
85
- konduktor/utils/kubernetes_utils.py,sha256=ivFVh90Gez19_JD5U4bgCO5zNtQUflF0hJsM5nZLj8A,23864
86
- konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
87
- konduktor/utils/loki_utils.py,sha256=ND1pbbbFhLhLKw3870j44LpR_9MB0EkDJSs5K7nWdY4,3473
88
- konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
89
- konduktor/utils/schemas.py,sha256=_VCWnsSgyP3u5cpACEmJeuqcy5mzu_fr0McHyZdiXd8,17757
86
+ konduktor/utils/kubernetes_utils.py,sha256=1MZHwU4vy-exA4TA5_oTiV-zm1A2ayfeA0T_75DMFM8,23937
87
+ konduktor/utils/log_utils.py,sha256=oFCKkYKCS_e_GRw_-0F7WsiIZNqJL1RZ4cD5-zh59Q4,9765
88
+ konduktor/utils/loki_utils.py,sha256=h2ZvZQr1nE_wXXsKsGMjhG2s2MXknNd4icydTR_ruKU,3539
89
+ konduktor/utils/rich_utils.py,sha256=ycADW6Ij3wX3uT8ou7T8qxX519RxlkJivsLvUahQaJo,3583
90
+ konduktor/utils/schemas.py,sha256=2fHsTi3t9q3LXqOPrcpkmPsMbaoJBnuJstd6ULmDiUo,16455
90
91
  konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
91
92
  konduktor/utils/ux_utils.py,sha256=czCwiS1bDqgeKtzAJctczpLwFZzAse7WuozdvzEFYJ4,7437
92
93
  konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
93
- konduktor_nightly-0.1.0.dev20250513105010.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
94
- konduktor_nightly-0.1.0.dev20250513105010.dist-info/METADATA,sha256=pQRlD0RZHFood4bguE1TiBn_ucsLumDQdxN82trZ7xc,4366
95
- konduktor_nightly-0.1.0.dev20250513105010.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
96
- konduktor_nightly-0.1.0.dev20250513105010.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
97
- konduktor_nightly-0.1.0.dev20250513105010.dist-info/RECORD,,
94
+ konduktor_nightly-0.1.0.dev20250515104942.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
95
+ konduktor_nightly-0.1.0.dev20250515104942.dist-info/METADATA,sha256=ikZ6zhXDw6OHLbu8sswo0tQh_J7S1kRDMJV0cwc4aI4,4366
96
+ konduktor_nightly-0.1.0.dev20250515104942.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
97
+ konduktor_nightly-0.1.0.dev20250515104942.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
98
+ konduktor_nightly-0.1.0.dev20250515104942.dist-info/RECORD,,