konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py CHANGED
@@ -3,12 +3,22 @@
 import os
 import subprocess
 
+from konduktor.execution import launch
+from konduktor.resource import Resources
+from konduktor.task import Task
+
+__all__ = [
+    'launch',
+    'Resources',
+    'Task',
+]
+
 # Replaced with the current commit when building the wheels.
-_KONDUKTOR_COMMIT_SHA = "10848f624606ce6072ec6c646c4b3a6cb347585a"
+_KONDUKTOR_COMMIT_SHA = '9d16b4c393e6b4dd288eca4ea6baf66591089b91'
 
 
 def _get_git_commit():
-    if "KONDUKTOR_COMMIT_SHA" not in _KONDUKTOR_COMMIT_SHA:
+    if 'KONDUKTOR_COMMIT_SHA' not in _KONDUKTOR_COMMIT_SHA:
         # This is a release build, so we don't need to get the commit hash from
         # git, as it's already been set.
         return _KONDUKTOR_COMMIT_SHA
@@ -18,24 +28,24 @@ def _get_git_commit():
     try:
         cwd = os.path.dirname(__file__)
         commit_hash = subprocess.check_output(
-            ["git", "rev-parse", "HEAD"],
+            ['git', 'rev-parse', 'HEAD'],
             cwd=cwd,
             universal_newlines=True,
             stderr=subprocess.DEVNULL,
         ).strip()
         changes = subprocess.check_output(
-            ["git", "status", "--porcelain"],
+            ['git', 'status', '--porcelain'],
             cwd=cwd,
             universal_newlines=True,
             stderr=subprocess.DEVNULL,
         ).strip()
         if changes:
-            commit_hash += "-dirty"
+            commit_hash += '-dirty'
         return commit_hash
     except Exception:  # pylint: disable=broad-except
         return _KONDUKTOR_COMMIT_SHA
 
 
 __commit__ = _get_git_commit()
-__version__ = "1.0.0-dev0"
+__version__ = '1.0.0.dev0.1.0.dev20250313070642'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
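The new top-level imports and `__all__` make `launch`, `Resources`, and `Task` available directly from the package root. A minimal usage sketch of what that enables (the `Task` constructor arguments and the `launch` call below are illustrative assumptions, not confirmed signatures):

import konduktor

# `launch`, `Resources`, and `Task` are exactly the names exported via
# __all__ above; the arguments are assumptions for illustration only.
task = konduktor.Task(run='python train.py')  # assumed constructor signature
job_id = konduktor.launch(task)               # assumed to return a job id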
konduktor/adaptors/__init__.py ADDED (empty file, no diff content)
konduktor/adaptors/common.py ADDED
@@ -0,0 +1,88 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Lazy import for modules to avoid import error when not used."""
+
+import functools
+import importlib
+import threading
+from typing import Any, Callable, Optional, Tuple
+
+
+class LazyImport:
+    """Lazy importer for heavy modules or cloud modules only when enabled.
+
+    We use this for pandas and networkx, as they can be time-consuming to import
+    (0.1-0.2 seconds). With this class, we can avoid the unnecessary import time
+    when the module is not used (e.g., `networkx` should not be imported for
+    `sky status` and `pandas` should not be imported for `sky exec`).
+
+    We also use this for cloud adaptors, because we do not want to import the
+    cloud dependencies when it is not enabled.
+    """
+
+    def __init__(
+        self,
+        module_name: str,
+        import_error_message: Optional[str] = None,
+        set_loggers: Optional[Callable] = None,
+    ):
+        self._module_name = module_name
+        self._module = None
+        self._import_error_message = import_error_message
+        self._set_loggers = set_loggers
+        self._lock = threading.RLock()
+
+    def load_module(self):
+        # Avoid extra imports when multiple threads try to import the same
+        # module. The overhead is minor since import can only run in serial
+        # due to GIL even in multi-threaded environments.
+        with self._lock:
+            if self._module is None:
+                try:
+                    self._module = importlib.import_module(self._module_name)
+                    if self._set_loggers is not None:
+                        self._set_loggers()
+                except ImportError as e:
+                    if self._import_error_message is not None:
+                        raise ImportError(self._import_error_message) from e
+                    raise
+        return self._module
+
+    def __getattr__(self, name: str) -> Any:
+        # Attempt to access the attribute, if it fails, assume it's a submodule
+        # and lazily import it
+        try:
+            if name in self.__dict__:
+                return self.__dict__[name]
+            return getattr(self.load_module(), name)
+        except AttributeError:
+            # Dynamically create a new LazyImport instance for the submodule
+            submodule_name = f'{self._module_name}.{name}'
+            lazy_submodule = LazyImport(submodule_name, self._import_error_message)
+            setattr(self, name, lazy_submodule)
+            return lazy_submodule
+
+
+def load_lazy_modules(modules: Tuple[LazyImport, ...]):
+    """Load lazy modules before entering a function to error out quickly."""
+
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            for m in modules:
+                m.load_module()
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
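To make the deferred-import behavior concrete, here is a small self-contained sketch of how `LazyImport` and `load_lazy_modules` compose, using `pandas` as a stand-in for any heavy dependency (the module choice and error message are arbitrary):

from konduktor.adaptors.common import LazyImport, load_lazy_modules

# No import happens here; `pandas` is only resolved on first attribute access
# or on an explicit load_module() call.
pandas = LazyImport(
    'pandas',
    import_error_message='pandas is required for this code path. '
    'Try: pip install pandas',
)
_LAZY_MODULES = (pandas,)


@load_lazy_modules(_LAZY_MODULES)
def summarize(csv_path: str):
    # The decorator forces load_module() before the body runs, so a missing
    # dependency raises the custom ImportError up front rather than midway.
    return pandas.read_csv(csv_path).describe()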
konduktor/adaptors/gcp.py ADDED
@@ -0,0 +1,112 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GCP cloud adaptors"""
+
+# pylint: disable=import-outside-toplevel
+import json
+
+from konduktor.adaptors import common
+
+_IMPORT_ERROR_MESSAGE = (
+    'Failed to import dependencies for GCP. ' 'Try pip install "skypilot[gcp]"'
+)
+googleapiclient = common.LazyImport(
+    'googleapiclient', import_error_message=_IMPORT_ERROR_MESSAGE
+)
+google = common.LazyImport('google', import_error_message=_IMPORT_ERROR_MESSAGE)
+_LAZY_MODULES = (google, googleapiclient)
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def build(service_name: str, version: str, *args, **kwargs):
+    """Build a GCP service.
+
+    Args:
+        service_name: GCP service name (e.g., 'compute', 'storagetransfer').
+        version: Service version (e.g., 'v1').
+    """
+
+    return googleapiclient.discovery.build(service_name, version, *args, **kwargs)
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def storage_client():
+    """Helper that connects to GCS Storage Client for GCS Bucket"""
+    from google.cloud import storage
+
+    return storage.Client()
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def anonymous_storage_client():
+    """Helper that connects to GCS Storage Client for Public GCS Buckets"""
+    from google.cloud import storage
+
+    return storage.Client.create_anonymous_client()
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def not_found_exception():
+    """NotFound exception."""
+    from google.api_core import exceptions as gcs_exceptions
+
+    return gcs_exceptions.NotFound
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def forbidden_exception():
+    """Forbidden exception."""
+    from google.api_core import exceptions as gcs_exceptions
+
+    return gcs_exceptions.Forbidden
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def http_error_exception():
+    """HttpError exception."""
+    from googleapiclient import errors
+
+    return errors.HttpError
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def credential_error_exception():
+    """CredentialError exception."""
+    from google.auth import exceptions
+
+    return exceptions.DefaultCredentialsError
+
+
+@common.load_lazy_modules(_LAZY_MODULES)
+def get_credentials(cred_type: str, credentials_field: str):
+    """Get GCP credentials."""
+    from google.oauth2 import service_account
+    from google.oauth2.credentials import Credentials as OAuthCredentials
+
+    if cred_type == 'service_account':
+        # If parsing the gcp_credentials failed, then the user likely made a
+        # mistake in copying the credentials into the config yaml.
+        try:
+            service_account_info = json.loads(credentials_field)
+        except json.decoder.JSONDecodeError as e:
+            raise RuntimeError(
+                'gcp_credentials found in cluster yaml file but '
+                'formatted improperly.'
+            ) from e
+        credentials = service_account.Credentials.from_service_account_info(
+            service_account_info
+        )
+    elif cred_type == 'credentials_token':
+        # Otherwise the credentials type must be credentials_token.
+        credentials = OAuthCredentials(credentials_field)
+    return credentials
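Every accessor above is wrapped in `load_lazy_modules`, so the Google SDKs are imported only at call time, and exception classes are exposed through factory functions rather than module-level imports. A sketch of typical caller-side usage (the bucket name is a placeholder):

from konduktor.adaptors import gcp

# google-cloud-storage is imported inside storage_client(), not here, so this
# module stays importable even without the GCP extras installed.
client = gcp.storage_client()
try:
    bucket = client.get_bucket('my-example-bucket')  # placeholder bucket name
except gcp.not_found_exception():
    # not_found_exception() returns the google.api_core NotFound class;
    # calling it produces the type used in the except clause.
    bucket = None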
konduktor/backends/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""Batch job backends"""
+
+from konduktor.backends.jobset import JobsetBackend
+
+__all__ = [
+    'Backend',
+    'JobsetBackend',
+]
konduktor/backends/backend.py ADDED
@@ -0,0 +1,86 @@
+# Proprietary Changes made for Trainy under the Trainy Software License
+# Original source: skypilot: https://github.com/skypilot-org/skypilot
+# which is Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Konduktor backend interface."""
+
+import typing
+from typing import Dict, Optional
+
+if typing.TYPE_CHECKING:
+    from konduktor.data import storage as storage_lib
+
+import konduktor
+from konduktor.utils import ux_utils
+
+Path = str
+
+
+class Backend:
+    """Backend interface: handles provisioning, setup, and scheduling."""
+
+    # NAME is used to identify the backend class from cli/yaml.
+    NAME = 'backend'
+
+    # --- APIs ---
+    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
+        """Check whether resources of the task are satisfied by cluster."""
+        raise NotImplementedError
+
+    def sync_workdir(self, workdir: Path) -> None:
+        return self._sync_workdir(workdir)
+
+    def sync_file_mounts(
+        self,
+        all_file_mounts: Optional[Dict[Path, Path]],
+        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
+    ) -> None:
+        return self._sync_file_mounts(all_file_mounts, storage_mounts)
+
+    def add_storage_objects(self, task: 'konduktor.Task') -> None:
+        raise NotImplementedError
+
+    def execute(
+        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
+    ) -> Optional[str]:
+        """Execute the task on the cluster.
+
+        Returns:
+            Job id if the task is submitted to the cluster, None otherwise.
+        """
+        ux_utils.spinner_message('Submitting job')
+        return self._execute(task, detach_run, dryrun)
+
+    def post_execute(self) -> None:
+        """Post execute(): e.g., print helpful inspection messages."""
+        return self._post_execute()
+
+    def register_info(self, **kwargs) -> None:
+        """Register backend-specific information."""
+        pass
+
+    def _sync_workdir(self, workdir: Path) -> None:
+        raise NotImplementedError
+
+    def _sync_file_mounts(
+        self,
+        all_file_mounts: Optional[Dict[Path, Path]],
+        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
+    ) -> None:
+        raise NotImplementedError
+
+    def _execute(
+        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
+    ) -> Optional[str]:
+        raise NotImplementedError
+
+    def _post_execute(self) -> None:
+        raise NotImplementedError
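The interface follows a template-method pattern: public entry points such as `execute` and `sync_workdir` add shared behavior and delegate to underscore-prefixed hooks that concrete backends override. A minimal conforming subclass might look like the following (an illustrative sketch, not part of the package):

from typing import Optional

from konduktor.backends.backend import Backend


class EchoBackend(Backend):
    """Toy backend that 'runs' a task by printing it; for illustration only."""

    NAME = 'echo'

    def _sync_workdir(self, workdir: str) -> None:
        print(f'would upload workdir {workdir!r}')

    def _sync_file_mounts(self, all_file_mounts, storage_mounts) -> None:
        print(f'would sync {len(all_file_mounts or {})} file mounts')

    def _execute(self, task, detach_run: bool, dryrun: bool = False) -> Optional[str]:
        # Returning a job id (or None on dryrun) mirrors the documented
        # contract of execute().
        return None if dryrun else getattr(task, 'name', 'echo-job')

    def _post_execute(self) -> None:
        print('done')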
konduktor/backends/jobset.py ADDED
@@ -0,0 +1,218 @@
+"""Batch job execution via k8s jobsets
+https://jobset.sigs.k8s.io/
+https://kueue.sigs.k8s.io/docs/tasks/run/jobsets/
+"""
+
+import threading
+import time
+import typing
+from typing import Dict, Optional, Tuple
+
+import colorama
+
+if typing.TYPE_CHECKING:
+    import konduktor
+    from konduktor.data import storage as storage_lib
+
+from konduktor import config, logging
+from konduktor.backends import backend, jobset_utils
+from konduktor.utils import kubernetes_utils, loki_utils, rich_utils, ux_utils
+
+Path = str
+logger = logging.get_logger(__file__)
+
+POLL_INTERVAL = 5
+DEFAULT_ATTACH_TIMEOUT = 300  # 5 minutes
+FLUSH_LOGS_TIMEOUT = 5
+
+
+class JobsetError(Exception):
+    pass
+
+
+def _raise_job_error(job):
+    """Checks a job's conditions and statuses for errors."""
+    for condition in job.status.conditions:
+        if 'ConfigIssue' in condition.message:
+            raise ValueError(
+                'Job failed with '
+                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
+                f'ConfigIssue: ErrImagePull.{colorama.Style.RESET_ALL} '
+                f'Check that your '
+                f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
+                f'`image_id`{colorama.Style.RESET_ALL} is correct and '
+                f'your container credentials are correct. Image specified '
+                f'in your task definition is '
+                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
+                f'`{job.spec.template.spec.containers[0].image}`'
+                f'{colorama.Style.RESET_ALL}'
+            )
+        elif 'BackoffLimitExceeded' == condition.reason:
+            raise JobsetError('Job failed with non-zero exit code.')
+    logger.error(
+        'Job failed with unknown error. Check jobset status in k8s with '
+        f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
+        f'`kubectl get job -o yaml {job.metadata.name}`'
+        f'{colorama.Style.RESET_ALL}'
+    )
+
+
+def _wait_for_jobset_start(namespace: str, job_name: str):
+    start = time.time()
+    timeout = config.get_nested(
+        ('kubernetes', 'pod_config'),
+        default_value=DEFAULT_ATTACH_TIMEOUT,
+        override_configs={},
+    )
+
+    while True:
+        jobsets = jobset_utils.get_jobset(namespace, job_name)
+        assert jobsets is not None, (
+            f'Jobset {job_name} ' f'not found in namespace {namespace}'
+        )
+        if jobsets['status']['replicatedJobsStatus'][0]['ready']:
+            logger.info(
+                f'task '
+                f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
+                f'{colorama.Style.RESET_ALL} ready'
+            )
+            break
+        elif jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
+            return
+        elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
+            logger.info(
+                f'job '
+                f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
+                f'{colorama.Style.RESET_ALL} '
+                f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
+            )
+            job = jobset_utils.get_job(namespace, job_name)
+            _raise_job_error(job)
+            return
+        if timeout != -1 and time.time() - start > timeout:
+            logger.error(
+                f'{colorama.Style.BRIGHT}'
+                f'{colorama.Fore.RED}Job timed out to schedule.'
+                f'{colorama.Style.RESET_ALL}. Deleting job'
+            )
+            jobset_utils.delete_jobset(namespace, job_name)
+            raise JobsetError(
+                'Job failed to start within '
+                f'timeout of {timeout} seconds. '
+                f'Increase or disable timeout '
+                f'{colorama.Style.BRIGHT}'
+                '`konduktor.provision_timeout: -1`'
+                f'{colorama.Style.RESET_ALL}'
+            )
+        time.sleep(POLL_INTERVAL)
+
+
+def _wait_for_jobset_completion(namespace: str, job_name: str) -> Tuple[bool, str]:
+    while True:
+        jobsets = jobset_utils.get_jobset(namespace, job_name)
+        assert jobsets is not None, (
+            f'Jobset {job_name} ' f'not found in namespace {namespace}'
+        )
+        if jobsets['status']['replicatedJobsStatus'][0]['succeeded']:
+            msg = (
+                f'task '
+                f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
+                f'{colorama.Style.RESET_ALL} {colorama.Fore.GREEN}'
+                f'{colorama.Style.BRIGHT}finished{colorama.Style.RESET_ALL}'
+            )
+            return True, msg
+        elif jobsets['status']['replicatedJobsStatus'][0]['failed']:
+            msg = (
+                f'task '
+                f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
+                f'{colorama.Style.RESET_ALL} {colorama.Fore.RED}'
+                f'{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
+            )
+            return False, msg
+        time.sleep(POLL_INTERVAL)
+
+
+class JobsetBackend(backend.Backend):
+    def _sync_file_mounts(
+        self,
+        all_file_mounts: Optional[Dict[Path, Path]],
+        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
+    ) -> None:
+        """Syncs files/directories to cloud storage before job launch.
+
+        This uploads any local files/dirs to cloud storage so they can be
+        downloaded by the pods when they start.
+        """
+        pass
+
+    def _sync_workdir(self, workdir: str) -> None:
+        """Syncs the working directory to cloud storage before job launch."""
+
+        pass
+
+    def _post_execute(self) -> None:
+        """
+        TODO(asaiacai): add some helpful messages/commands that a user can run
+        to inspect the status of their jobset.
+        """
+        pass
+
+    def _execute(
+        self, task: 'konduktor.Task', detach_run: bool = False, dryrun: bool = False
+    ) -> Optional[str]:
+        """Executes the task on the cluster by creating a jobset.
+
+        Returns:
+            Job id if the task is submitted to the cluster, None otherwise.
+        """
+
+        # we should consider just building an image with the cloud provider
+        # sdks baked in. These can initialize and pull files first before
+        # the working container starts.
+
+        # first define the pod spec then create the jobset definition
+        pod_spec = jobset_utils.create_pod_spec(task)
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+        # TODO(asaiacai): need to set env variables in pod
+        jobset_utils.create_jobset(
+            namespace,
+            task,
+            pod_spec['kubernetes']['pod_config'],
+            dryrun=dryrun,
+        )
+
+        if not dryrun and not detach_run:
+            with ux_utils.print_exception_no_traceback():
+                with rich_utils.safe_status(
+                    ux_utils.spinner_message('waiting for job to start\n')
+                ):
+                    _wait_for_jobset_start(namespace, task.name)
+            try:
+                log_thread = threading.Thread(
+                    target=loki_utils.tail_loki_logs_ws,
+                    args=(task.name,),
+                    daemon=True,
+                )
+                logger.info('streaming logs...')
+                log_thread.start()
+                is_success, msg = _wait_for_jobset_completion(namespace, task.name)
+                log_thread.join(timeout=2.0)  # give the job some time to flush logs
+                if not is_success:
+                    logger.error(msg)
+                else:
+                    logger.info(msg)
+            except KeyboardInterrupt:
+                logger.info('detaching from log stream...')
+            except Exception as err:
+                logger.error(
+                    f'Check if job resources are '
+                    f'active/queued with '
+                    f'{colorama.Style.BRIGHT}'
+                    f'`konduktor status`'
+                    f'{colorama.Style.RESET_ALL}'
+                )
+                raise JobsetError(f'error: {err}')
+        else:
+            logger.info('detaching from run.')
+        return task.name
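End to end, `_execute` renders a pod spec, creates the JobSet in the current kubeconfig namespace, and then either detaches immediately or blocks while polling jobset status every `POLL_INTERVAL` seconds and streaming Loki logs. A hedged sketch of driving the backend directly (normally `konduktor.launch` handles this; the `Task` construction is an assumption):

import konduktor
from konduktor.backends import JobsetBackend

# Task constructor arguments are illustrative assumptions.
task = konduktor.Task(run='python train.py')

backend = JobsetBackend()
# detach_run=True returns right after the JobSet is created;
# detach_run=False blocks until the replicated job succeeds or fails,
# tailing Loki logs in a background thread along the way.
job_name = backend.execute(task, detach_run=True)
print(f'submitted jobset: {job_name}')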