konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
@@ -3,12 +3,22 @@
|
|
3
3
|
import os
|
4
4
|
import subprocess
|
5
5
|
|
6
|
+
from konduktor.execution import launch
|
7
|
+
from konduktor.resource import Resources
|
8
|
+
from konduktor.task import Task
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
'launch',
|
12
|
+
'Resources',
|
13
|
+
'Task',
|
14
|
+
]
|
15
|
+
|
6
16
|
# Replaced with the current commit when building the wheels.
|
7
|
-
_KONDUKTOR_COMMIT_SHA =
|
17
|
+
_KONDUKTOR_COMMIT_SHA = '9d16b4c393e6b4dd288eca4ea6baf66591089b91'
|
8
18
|
|
9
19
|
|
10
20
|
def _get_git_commit():
|
11
|
-
if
|
21
|
+
if 'KONDUKTOR_COMMIT_SHA' not in _KONDUKTOR_COMMIT_SHA:
|
12
22
|
# This is a release build, so we don't need to get the commit hash from
|
13
23
|
# git, as it's already been set.
|
14
24
|
return _KONDUKTOR_COMMIT_SHA
|
@@ -18,24 +28,24 @@ def _get_git_commit():
|
|
18
28
|
try:
|
19
29
|
cwd = os.path.dirname(__file__)
|
20
30
|
commit_hash = subprocess.check_output(
|
21
|
-
[
|
31
|
+
['git', 'rev-parse', 'HEAD'],
|
22
32
|
cwd=cwd,
|
23
33
|
universal_newlines=True,
|
24
34
|
stderr=subprocess.DEVNULL,
|
25
35
|
).strip()
|
26
36
|
changes = subprocess.check_output(
|
27
|
-
[
|
37
|
+
['git', 'status', '--porcelain'],
|
28
38
|
cwd=cwd,
|
29
39
|
universal_newlines=True,
|
30
40
|
stderr=subprocess.DEVNULL,
|
31
41
|
).strip()
|
32
42
|
if changes:
|
33
|
-
commit_hash +=
|
43
|
+
commit_hash += '-dirty'
|
34
44
|
return commit_hash
|
35
45
|
except Exception: # pylint: disable=broad-except
|
36
46
|
return _KONDUKTOR_COMMIT_SHA
|
37
47
|
|
38
48
|
|
39
49
|
__commit__ = _get_git_commit()
|
40
|
-
__version__ =
|
50
|
+
__version__ = '1.0.0.dev0.1.0.dev20250313070642'
|
41
51
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
File without changes
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""Lazy import for modules to avoid import error when not used."""
|
14
|
+
|
15
|
+
import functools
|
16
|
+
import importlib
|
17
|
+
import threading
|
18
|
+
from typing import Any, Callable, Optional, Tuple
|
19
|
+
|
20
|
+
|
21
|
+
class LazyImport:
    """Proxy object that postpones importing a module until first use.

    The wrapped module is imported the first time `load_module()` is called
    or the first time an attribute is looked up on the proxy. This keeps
    expensive or optional dependencies (for example cloud SDKs) from being
    imported when the code path that needs them is never executed.

    Attribute access that the imported module cannot satisfy is treated as
    a reference to a submodule: a new `LazyImport` for
    `<module_name>.<attribute>` is created, cached on this instance and
    returned.
    """

    def __init__(
        self,
        module_name: str,
        import_error_message: Optional[str] = None,
        set_loggers: Optional[Callable] = None,
    ):
        """Record what to import; nothing is imported here.

        Args:
            module_name: Dotted name passed to `importlib.import_module`.
            import_error_message: If given, an `ImportError` raised while
                loading is replaced by `ImportError(import_error_message)`
                chained to the original error.
            set_loggers: Optional zero-argument callable invoked right
                after the module has been imported.
        """
        self._module_name = module_name
        self._module = None
        self._import_error_message = import_error_message
        self._set_loggers = set_loggers
        # Re-entrant lock so that concurrent callers import only once.
        self._lock = threading.RLock()

    def load_module(self):
        """Import the target module on first call and return it.

        Raises:
            ImportError: If the import (or the `set_loggers` hook) raises
                `ImportError`; the custom message is used when configured.
        """
        with self._lock:
            if self._module is not None:
                return self._module
            try:
                self._module = importlib.import_module(self._module_name)
                if self._set_loggers is not None:
                    self._set_loggers()
            except ImportError as e:
                if self._import_error_message is None:
                    raise
                raise ImportError(self._import_error_message) from e
            return self._module

    def __getattr__(self, name: str) -> Any:
        """Resolve `name` on the real module, or fall back to a lazy submodule.

        Only called when normal attribute lookup on the proxy fails.
        """
        try:
            own_attributes = self.__dict__
            if name in own_attributes:
                return own_attributes[name]
            target_module = self.load_module()
            return getattr(target_module, name)
        except AttributeError:
            # The module has no such attribute: assume it is a submodule
            # that has not been imported yet and hand out a lazy proxy.
            child = LazyImport(
                f'{self._module_name}.{name}', self._import_error_message
            )
            # Cache the proxy so later lookups return the same object.
            setattr(self, name, child)
            return child
|
74
|
+
|
75
|
+
|
76
|
+
def load_lazy_modules(modules: Tuple[LazyImport, ...]):
    """Build a decorator that imports `modules` before each call.

    Every wrapped call first invokes `load_module()` on each entry of
    `modules`, in order, so a missing dependency surfaces as an import
    error before the decorated function body runs.

    Args:
        modules: The lazy module proxies to load eagerly.

    Returns:
        A decorator preserving the wrapped function's metadata.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for lazy_module in modules:
                lazy_module.load_module()
            result = func(*args, **kwargs)
            return result

        return wrapper

    return decorator
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""GCP cloud adaptors"""
|
14
|
+
|
15
|
+
# pylint: disable=import-outside-toplevel
|
16
|
+
import json
|
17
|
+
|
18
|
+
from konduktor.adaptors import common
|
19
|
+
|
20
|
+
# Message used when any of the lazily imported GCP packages is missing.
_IMPORT_ERROR_MESSAGE = (
    'Failed to import dependencies for GCP. Try pip install "skypilot[gcp]"'
)

# Lazy proxies: the real packages are imported only on first use.
google = common.LazyImport('google', import_error_message=_IMPORT_ERROR_MESSAGE)
googleapiclient = common.LazyImport(
    'googleapiclient', import_error_message=_IMPORT_ERROR_MESSAGE
)

# Modules force-loaded by the `load_lazy_modules` decorator below.
_LAZY_MODULES = (google, googleapiclient)
|
28
|
+
|
29
|
+
|
30
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def build(service_name: str, version: str, *args, **kwargs):
    """Create a client for a GCP service via `googleapiclient.discovery`.

    Args:
        service_name: GCP service name (e.g., 'compute', 'storagetransfer').
        version: Service version (e.g., 'v1').
        *args: Forwarded unchanged to `googleapiclient.discovery.build`.
        **kwargs: Forwarded unchanged to `googleapiclient.discovery.build`.

    Returns:
        The object returned by `googleapiclient.discovery.build`.
    """
    discovery = googleapiclient.discovery
    return discovery.build(service_name, version, *args, **kwargs)
|
40
|
+
|
41
|
+
|
42
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def storage_client():
    """Return a new `google.cloud.storage.Client` built with no arguments."""
    from google.cloud import storage as gcs_storage

    client = gcs_storage.Client()
    return client
|
48
|
+
|
49
|
+
|
50
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def anonymous_storage_client():
    """Return an anonymous GCS client, for use with public buckets."""
    from google.cloud import storage as gcs_storage

    client_cls = gcs_storage.Client
    return client_cls.create_anonymous_client()
|
56
|
+
|
57
|
+
|
58
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def not_found_exception():
    """Return the `google.api_core.exceptions.NotFound` exception class."""
    from google.api_core import exceptions as api_core_exceptions

    return api_core_exceptions.NotFound
|
64
|
+
|
65
|
+
|
66
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def forbidden_exception():
    """Return the `google.api_core.exceptions.Forbidden` exception class."""
    from google.api_core import exceptions as api_core_exceptions

    return api_core_exceptions.Forbidden
|
72
|
+
|
73
|
+
|
74
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def http_error_exception():
    """Return the `googleapiclient.errors.HttpError` exception class."""
    from googleapiclient import errors as api_client_errors

    return api_client_errors.HttpError
|
80
|
+
|
81
|
+
|
82
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def credential_error_exception():
    """Return the `google.auth.exceptions.DefaultCredentialsError` class."""
    from google.auth import exceptions as auth_exceptions

    return auth_exceptions.DefaultCredentialsError
|
88
|
+
|
89
|
+
|
90
|
+
@common.load_lazy_modules(_LAZY_MODULES)
def get_credentials(cred_type: str, credentials_field: str):
    """Build GCP credentials from a configuration value.

    Args:
        cred_type: Either 'service_account' or 'credentials_token'.
        credentials_field: For 'service_account', a JSON document holding
            the service account info; for 'credentials_token', the value
            passed directly to `google.oauth2.credentials.Credentials`.

    Returns:
        A service-account `Credentials` object or an OAuth `Credentials`
        object, depending on `cred_type`.

    Raises:
        RuntimeError: If `credentials_field` is not valid JSON for the
            'service_account' type.
        ValueError: If `cred_type` is not one of the supported values.
    """
    from google.oauth2 import service_account
    from google.oauth2.credentials import Credentials as OAuthCredentials

    if cred_type == 'service_account':
        # If parsing the gcp_credentials failed, then the user likely made a
        # mistake in copying the credentials into the config yaml.
        try:
            service_account_info = json.loads(credentials_field)
        except json.decoder.JSONDecodeError as e:
            raise RuntimeError(
                'gcp_credentials found in cluster yaml file but '
                'formatted improperly.'
            ) from e
        return service_account.Credentials.from_service_account_info(
            service_account_info
        )
    if cred_type == 'credentials_token':
        return OAuthCredentials(credentials_field)
    # Previously an unknown type fell through to `return credentials` with
    # the name never assigned, surfacing as an UnboundLocalError.
    raise ValueError(
        f'Unsupported GCP credential type: {cred_type!r}. Expected '
        "'service_account' or 'credentials_token'."
    )
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""Konduktor backend interface."""
|
14
|
+
|
15
|
+
import typing
|
16
|
+
from typing import Dict, Optional
|
17
|
+
|
18
|
+
if typing.TYPE_CHECKING:
|
19
|
+
from konduktor.data import storage as storage_lib
|
20
|
+
|
21
|
+
import konduktor
|
22
|
+
from konduktor.utils import ux_utils
|
23
|
+
|
24
|
+
# Type alias: paths are handled as plain strings in the annotations below.
Path = str
|
25
|
+
|
26
|
+
|
27
|
+
class Backend:
    """Backend interface: handles provisioning, setup, and scheduling.

    The public methods are thin entry points that delegate to the
    underscore-prefixed hooks; every hook raises `NotImplementedError`
    here and is expected to be provided by a concrete backend.
    """

    # NAME is used to identify the backend class from cli/yaml.
    NAME = 'backend'

    # --- APIs ---
    def check_resources_fit_cluster(self, task: 'konduktor.Task') -> bool:
        """Check whether resources of the task are satisfied by cluster."""
        raise NotImplementedError

    def sync_workdir(self, workdir: Path) -> None:
        """Delegate to `_sync_workdir` with the given working directory."""
        return self._sync_workdir(workdir)

    def sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Delegate to `_sync_file_mounts` with both mount mappings."""
        return self._sync_file_mounts(all_file_mounts, storage_mounts)

    def add_storage_objects(self, task: 'konduktor.Task') -> None:
        """Not implemented in the base class."""
        raise NotImplementedError

    def execute(
        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
    ) -> Optional[str]:
        """Execute the task on the cluster.

        Returns:
            Job id if the task is submitted to the cluster, None otherwise.
        """
        # NOTE(review): the value returned by spinner_message is discarded
        # here; confirm whether the call has any visible effect.
        ux_utils.spinner_message('Submitting job')
        return self._execute(task, detach_run, dryrun)

    def post_execute(self) -> None:
        """Post execute(): e.g., print helpful inspection messages."""
        return self._post_execute()

    def register_info(self, **kwargs) -> None:
        """Register backend-specific information (no-op in the base class)."""

    # --- Hooks to be provided by concrete backends ---
    def _sync_workdir(self, workdir: Path) -> None:
        raise NotImplementedError

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        raise NotImplementedError

    def _execute(
        self, task: 'konduktor.Task', detach_run: bool, dryrun: bool = False
    ) -> Optional[str]:
        raise NotImplementedError

    def _post_execute(self) -> None:
        raise NotImplementedError
|
@@ -0,0 +1,218 @@
|
|
1
|
+
"""Batch job execution via k8s jobsets
|
2
|
+
https://jobset.sigs.k8s.io/
|
3
|
+
https://kueue.sigs.k8s.io/docs/tasks/run/jobsets/
|
4
|
+
"""
|
5
|
+
|
6
|
+
import threading
|
7
|
+
import time
|
8
|
+
import typing
|
9
|
+
from typing import Dict, Optional, Tuple
|
10
|
+
|
11
|
+
import colorama
|
12
|
+
|
13
|
+
if typing.TYPE_CHECKING:
|
14
|
+
import konduktor
|
15
|
+
from konduktor.data import storage as storage_lib
|
16
|
+
|
17
|
+
from konduktor import config, logging
|
18
|
+
from konduktor.backends import backend, jobset_utils
|
19
|
+
from konduktor.utils import kubernetes_utils, loki_utils, rich_utils, ux_utils
|
20
|
+
|
21
|
+
# Type alias: paths are handled as plain strings in the annotations below.
Path = str
logger = logging.get_logger(__file__)

# Passed to time.sleep() between jobset status polls.
POLL_INTERVAL = 5
# Default used when no start timeout is found in the config.
DEFAULT_ATTACH_TIMEOUT = 300  # 5 minutes
# NOTE(review): not referenced by the code visible in this section.
FLUSH_LOGS_TIMEOUT = 5
|
27
|
+
|
28
|
+
|
29
|
+
class JobsetError(Exception):
    """Raised when a jobset fails, times out, or cannot be followed."""
|
31
|
+
|
32
|
+
|
33
|
+
def _raise_job_error(job):
    """Inspect a failed job's conditions and raise a descriptive error.

    Args:
        job: Job object exposing `status.conditions`, `metadata.name` and
            `spec.template.spec.containers`.

    Raises:
        ValueError: If a condition message mentions 'ConfigIssue'.
        JobsetError: If a condition reason is 'BackoffLimitExceeded'.

    If no condition matches, an error is logged pointing the user at
    `kubectl` and the function returns normally.
    """
    # `conditions` and a condition's `message` may be unset (None); treat
    # both as empty instead of failing with a TypeError that would mask
    # the real job failure.
    for condition in job.status.conditions or []:
        if 'ConfigIssue' in (condition.message or ''):
            raise ValueError(
                'Job failed with '
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'ConfigIssue: ErrImagePull.{colorama.Style.RESET_ALL} '
                f'Check that your '
                f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
                f'`image_id`{colorama.Style.RESET_ALL} is correct and '
                f'your container credentials are correct. Image specified '
                f'in your task definition is '
                f'{colorama.Style.BRIGHT}{colorama.Fore.RED}'
                f'`{job.spec.template.spec.containers[0].image}`'
                f'{colorama.Style.RESET_ALL}'
            )
        elif condition.reason == 'BackoffLimitExceeded':
            raise JobsetError('Job failed with non-zero exit code.')
    logger.error(
        'Job failed with unknown error. Check jobset status in k8s with '
        f'{colorama.Style.BRIGHT}{colorama.Fore.YELLOW}'
        f'`kubectl get job -o yaml {job.metadata.name}`'
        f'{colorama.Style.RESET_ALL}'
    )
|
58
|
+
|
59
|
+
|
60
|
+
def _wait_for_jobset_start(namespace: str, job_name: str):
    """Poll a jobset until it is ready, has finished, or times out.

    Args:
        namespace: Namespace the jobset lives in.
        job_name: Name of the jobset to watch.

    Returns:
        None once the first replicated job reports ready, succeeded, or
        failed (after `_raise_job_error` had the chance to raise).

    Raises:
        JobsetError: If the jobset did not start within the timeout; the
            jobset is deleted before raising. A timeout of -1 disables
            the check.
    """
    start = time.time()
    timeout = config.get_nested(
        ('kubernetes', 'pod_config'),
        default_value=DEFAULT_ATTACH_TIMEOUT,
        override_configs={},
    )
    # NOTE(review): `pod_config` is used as a mapping elsewhere in this
    # file, so this key looks wrong for a timeout (the error message below
    # refers to `provision_timeout`) -- confirm the intended key. Until
    # then, fall back to the default when the value is not a number so the
    # comparison against elapsed seconds cannot raise TypeError.
    if not isinstance(timeout, (int, float)):
        timeout = DEFAULT_ATTACH_TIMEOUT

    while True:
        jobsets = jobset_utils.get_jobset(namespace, job_name)
        assert jobsets is not None, (
            f'Jobset {job_name} ' f'not found in namespace {namespace}'
        )
        # A jobset may not carry a populated status yet; treat missing
        # fields as "not ready" and keep polling.
        replicated = (jobsets.get('status') or {}).get('replicatedJobsStatus')
        job_status = replicated[0] if replicated else {}
        if job_status.get('ready'):
            logger.info(
                f'task '
                f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                f'{colorama.Style.RESET_ALL} ready'
            )
            break
        elif job_status.get('succeeded'):
            return
        elif job_status.get('failed'):
            logger.info(
                f'job '
                f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
                f'{colorama.Style.RESET_ALL} '
                f'{colorama.Fore.RED}{colorama.Style.BRIGHT}failed{colorama.Style.RESET_ALL}'
            )
            job = jobset_utils.get_job(namespace, job_name)
            _raise_job_error(job)
            return
        if timeout != -1 and time.time() - start > timeout:
            logger.error(
                f'{colorama.Style.BRIGHT}'
                f'{colorama.Fore.RED}Job timed out to schedule.'
                f'{colorama.Style.RESET_ALL}. Deleting job'
            )
            jobset_utils.delete_jobset(namespace, job_name)
            raise JobsetError(
                'Job failed to start within '
                f'timeout of {timeout} seconds. '
                f'Increase or disable timeout '
                f'{colorama.Style.BRIGHT}'
                '`konduktor.provision_timeout: -1`'
                f'{colorama.Style.RESET_ALL}'
            )
        time.sleep(POLL_INTERVAL)
|
108
|
+
|
109
|
+
|
110
|
+
def _wait_for_jobset_completion(namespace: str, job_name: str) -> Tuple[bool, str]:
    """Block until the jobset's first replicated job succeeds or fails.

    Polls every `POLL_INTERVAL` seconds; there is no timeout.

    Args:
        namespace: Namespace the jobset lives in.
        job_name: Name of the jobset to watch.

    Returns:
        `(True, message)` on success or `(False, message)` on failure,
        where `message` is a colorized one-line summary.
    """
    while True:
        jobset = jobset_utils.get_jobset(namespace, job_name)
        assert jobset is not None, (
            f'Jobset {job_name} not found in namespace {namespace}'
        )
        replicated_status = jobset['status']['replicatedJobsStatus'][0]
        if replicated_status['succeeded']:
            succeeded, colour, verb = True, colorama.Fore.GREEN, 'finished'
        elif replicated_status['failed']:
            succeeded, colour, verb = False, colorama.Fore.RED, 'failed'
        else:
            # Still running: wait and poll again.
            time.sleep(POLL_INTERVAL)
            continue
        summary = (
            f'task '
            f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}{job_name}'
            f'{colorama.Style.RESET_ALL} {colour}'
            f'{colorama.Style.BRIGHT}{verb}{colorama.Style.RESET_ALL}'
        )
        return succeeded, summary
|
133
|
+
|
134
|
+
|
135
|
+
class JobsetBackend(backend.Backend):
    """Backend that runs a task by creating a Kubernetes jobset."""

    def _sync_file_mounts(
        self,
        all_file_mounts: Optional[Dict[Path, Path]],
        storage_mounts: Optional[Dict[Path, 'storage_lib.Storage']],
    ) -> None:
        """Syncs files/directories to cloud storage before job launch.

        This uploads any local files/dirs to cloud storage so they can be downloaded
        by the pods when they start.

        Currently a no-op in this backend.
        """
        pass

    def _sync_workdir(self, workdir: str) -> None:
        """Syncs the working directory to cloud storage before job launch.

        Currently a no-op in this backend.
        """

        pass

    def _post_execute(self) -> None:
        """
        TODO(asaiacai): add some helpful messages/commands that a user can run
        to inspect the status of their jobset.
        """
        pass

    def _execute(
        self, task: 'konduktor.Task', detach_run: bool = False, dryrun: bool = False
    ) -> Optional[str]:
        """Executes the task on the cluster by creating a jobset.

        Unless `dryrun` or `detach_run` is set, waits for the jobset to
        start, streams its logs from a daemon thread, and waits for the
        jobset to finish.

        Args:
            task: The task to run; `task.name` is used as the jobset name.
            detach_run: If True, return right after submitting the jobset.
            dryrun: Forwarded to `jobset_utils.create_jobset`; also skips
                waiting and log streaming.

        Returns:
            Job id if the task is submitted to the cluster, None otherwise.

        Raises:
            JobsetError: If streaming logs or waiting for completion fails
                with an unexpected exception (chained to the original).
        """

        # we should consider just building an image with the cloud provider
        # sdks baked in. These can initialize and pull files first before
        # the working container starts.

        # first define the pod spec then create the jobset definition
        pod_spec = jobset_utils.create_pod_spec(task)
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        # TODO(asaiacai): need to set env variables in pod
        jobset_utils.create_jobset(
            namespace,
            task,
            pod_spec['kubernetes']['pod_config'],
            dryrun=dryrun,
        )

        if not dryrun and not detach_run:
            with ux_utils.print_exception_no_traceback():
                with rich_utils.safe_status(
                    ux_utils.spinner_message('waiting for job to start\n')
                ):
                    _wait_for_jobset_start(namespace, task.name)
                try:
                    # Daemon thread: it must not keep the process alive once
                    # the job has finished or the user interrupts.
                    log_thread = threading.Thread(
                        target=loki_utils.tail_loki_logs_ws,
                        args=(task.name,),
                        daemon=True,
                    )
                    logger.info('streaming logs...')
                    log_thread.start()
                    is_success, msg = _wait_for_jobset_completion(namespace, task.name)
                    # NOTE(review): FLUSH_LOGS_TIMEOUT exists at module level
                    # but this wait is hard-coded -- confirm which is intended.
                    log_thread.join(timeout=2.0)  # give the job sometime to flush logs
                    if not is_success:
                        logger.error(msg)
                    else:
                        logger.info(msg)
                except KeyboardInterrupt:
                    logger.info('detaching from log stream...')
                except Exception as err:
                    logger.error(
                        f'Check if job resources are '
                        f'active/queued with '
                        f'{colorama.Style.BRIGHT}'
                        f'`konduktor status`'
                        f'{colorama.Style.RESET_ALL}'
                    )
                    # Chain explicitly so the original failure is kept as
                    # the cause of the JobsetError.
                    raise JobsetError(f'error: {err}') from err
        else:
            logger.info('detaching from run.')
        return task.name
|