konduktor-nightly 0.1.0.dev20250327104656__py3-none-any.whl → 0.1.0.dev20250328104606__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +3 -2
- konduktor/adaptors/gcp.py +43 -4
- konduktor/backends/jobset_utils.py +4 -3
- konduktor/check.py +8 -16
- konduktor/data/constants.py +0 -5
- konduktor/data/gcp/__init__.py +2 -2
- konduktor/data/gcp/gcs.py +80 -0
- konduktor/data/registry.py +18 -0
- konduktor/data/storage.py +3 -3
- konduktor/data/storage_utils.py +35 -0
- konduktor/execution.py +2 -2
- konduktor/kube_client.py +1 -1
- konduktor/templates/pod.yaml.j2 +1 -1
- konduktor/utils/schemas.py +5 -4
- {konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/METADATA +2 -1
- {konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/RECORD +19 -19
- konduktor/cloud_stores.py +0 -158
- {konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/entry_points.txt +0 -0
konduktor/__init__.py
CHANGED
@@ -14,7 +14,8 @@ __all__ = [
 ]
 
 
 # Replaced with the current commit when building the wheels.
-_KONDUKTOR_COMMIT_SHA = '
+_KONDUKTOR_COMMIT_SHA = '139a1d21db826f4beb908a929f2a02c514e1adef'
+os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
 
 
 def _get_git_commit():
@@ -47,5 +48,5 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev0.1.0.
+__version__ = '1.0.0.dev0.1.0.dev20250328104606'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
konduktor/adaptors/gcp.py
CHANGED
@@ -12,13 +12,16 @@
 
 """GCP cloud adaptors"""
 
-# pylint: disable=import-outside-toplevel
 import json
+import os
+from functools import wraps
+
+from filelock import FileLock
 
 from konduktor.adaptors import common
 
 _IMPORT_ERROR_MESSAGE = (
-    'Failed to import dependencies for GCP. ' 'Try pip install "
+    'Failed to import dependencies for GCP. ' 'Try pip install "konduktor[gcp]"'
 )
 googleapiclient = common.LazyImport(
     'googleapiclient', import_error_message=_IMPORT_ERROR_MESSAGE
@@ -27,6 +30,34 @@ google = common.LazyImport('google', import_error_message=_IMPORT_ERROR_MESSAGE)
 _LAZY_MODULES = (google, googleapiclient)
 
 
+class LockedClientProxy:
+    """Proxy for GCP client that locks access to the client."""
+
+    def __init__(
+        self,
+        client,
+        lock_path=os.path.expanduser('~/.konduktor/gcs_storage.lock'),
+        timeout=10,
+    ):
+        self._client = client
+        self._lock = FileLock(lock_path, timeout=timeout)
+
+    def __getattr__(self, attr):
+        target = getattr(self._client, attr)
+
+        if callable(target):
+
+            @wraps(target)
+            def locked_method(*args, **kwargs):
+                with self._lock:
+                    return target(*args, **kwargs)
+
+            return locked_method
+        else:
+            # Attribute (not method) access just passes through
+            return target
+
+
 @common.load_lazy_modules(_LAZY_MODULES)
 def build(service_name: str, version: str, *args, **kwargs):
     """Build a GCP service.
@@ -44,7 +75,7 @@ def storage_client():
     """Helper that connects to GCS Storage Client for GCS Bucket"""
     from google.cloud import storage
 
-    return storage.Client()
+    return LockedClientProxy(storage.Client())
 
 
 @common.load_lazy_modules(_LAZY_MODULES)
@@ -52,7 +83,7 @@ def anonymous_storage_client():
     """Helper that connects to GCS Storage Client for Public GCS Buckets"""
     from google.cloud import storage
 
-    return storage.Client.create_anonymous_client()
+    return LockedClientProxy(storage.Client.create_anonymous_client())
 
 
 @common.load_lazy_modules(_LAZY_MODULES)
@@ -71,6 +102,14 @@ def forbidden_exception():
     return gcs_exceptions.Forbidden
 
 
+@common.load_lazy_modules(_LAZY_MODULES)
+def conflict_exception():
+    """Conflict exception."""
+    from google.api_core import exceptions as gcs_exceptions
+
+    return gcs_exceptions.Conflict
+
+
 @common.load_lazy_modules(_LAZY_MODULES)
 def http_error_exception():
     """HttpError exception."""
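The LockedClientProxy added above wraps every method call on the underlying google-cloud-storage client in a filelock.FileLock, which is why this release adds a filelock dependency (see the METADATA diff below) and presumably why konduktor/__init__.py now pre-creates ~/.konduktor, where the default lock file lives. A minimal usage sketch, assuming GCP credentials are configured; the bucket name is a placeholder:

```python
from konduktor.adaptors import gcp

# storage_client() now returns LockedClientProxy(storage.Client()).
client = gcp.storage_client()

# Method lookups go through __getattr__: the wrapper acquires
# ~/.konduktor/gcs_storage.lock, invokes the real method, then releases the
# lock, so concurrent konduktor processes serialize their GCS calls.
bucket = client.bucket('example-bucket')  # placeholder bucket name

# Non-callable attributes pass straight through to the wrapped client.
print(client.project)
```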
konduktor/backends/jobset_utils.py
CHANGED
@@ -12,7 +12,8 @@ from urllib.parse import urlparse
 import colorama
 
 import konduktor
-from konduktor import
+from konduktor import constants, kube_client, logging
+from konduktor.data import registry
 from konduktor.utils import common_utils, kubernetes_utils, log_utils
 
 if typing.TYPE_CHECKING:
@@ -102,7 +103,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
         if '/tmp/konduktor-job-filemounts-files' in dst:
             continue
         # should impelement a method here instead of raw dog dict access
-        cloud_store =
+        cloud_store = registry._REGISTRY[store_scheme]
         storage_secrets[store_scheme] = cloud_store._STORE.get_k8s_credential_name()
         mkdir_commands.append(
             f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};' f'mkdir -p {dst}'
@@ -118,7 +119,7 @@ def create_pod_spec(task: 'konduktor.Task') -> Dict[str, Any]:
     assert task.file_mounts is not None
     for dst, src in task.file_mounts.items():
         store_scheme = str(urlparse(store.source).scheme)
-        cloud_store =
+        cloud_store = registry._REGISTRY[store_scheme]
         mkdir_commands.append(
             f'cd {constants.KONDUKTOR_REMOTE_WORKDIR};'
             f'mkdir -p {os.path.dirname(dst)}'
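Both hunks in jobset_utils.py swap a reference to the deleted cloud_stores module for a lookup in the new konduktor.data.registry. Roughly, the pattern is the following; the gs:// source below is a made-up example:

```python
from urllib.parse import urlparse

from konduktor.data import registry

source = 'gs://example-bucket/checkpoints'      # hypothetical file-mount source
store_scheme = str(urlparse(source).scheme)     # -> 'gs'
cloud_store = registry._REGISTRY[store_scheme]  # -> GcsCloudStorage instance

# The AbstractStore class behind the CloudStorage wrapper names the k8s secret
# that carries this store's credentials into the job pod.
secret_name = cloud_store._STORE.get_k8s_credential_name()
```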
konduktor/check.py
CHANGED
@@ -34,12 +34,13 @@ from typing import Iterable, List, Optional, Tuple
 import click
 import colorama
 
-from konduktor import cloud_stores, logging
 from konduktor import config as konduktor_config
+from konduktor import logging
+from konduktor.data import registry
 from konduktor.utils import rich_utils
 
 if typing.TYPE_CHECKING:
-
+    from konduktor.data import storage_utils
 
 logger = logging.get_logger(__name__)
 
@@ -58,7 +59,7 @@ def check(
     disabled_clouds = []
 
     def check_one_cloud(
-        cloud_tuple: Tuple[str, '
+        cloud_tuple: Tuple[str, 'storage_utils.CloudStorage'],
     ) -> None:
         cloud_repr, cloud = cloud_tuple
         with rich_utils.safe_status(f'Checking {cloud_repr}...'):
@@ -80,15 +81,15 @@ def check(
             disabled_clouds.append(cloud_repr)
             echo(f' Reason: {reason}')
 
-    def get_cloud_tuple(cloud_name: str) -> Tuple[str, '
+    def get_cloud_tuple(cloud_name: str) -> Tuple[str, 'storage_utils.CloudStorage']:
         # Validates cloud_name and returns a tuple of the cloud's name and
         # the cloud object. Includes special handling for Cloudflare.
-        cloud_obj =
+        cloud_obj = registry._REGISTRY.get(cloud_name, None)
         assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
         return cloud_name, cloud_obj
 
     def get_all_clouds():
-        return tuple([c for c in
+        return tuple([c for c in registry._REGISTRY.keys()])
 
     if clouds is not None:
         cloud_list = clouds
@@ -158,7 +159,7 @@ def check(
         + '\n'
         + click.style(
             'If any problems remain, refer to detailed docs at: '
-            'https://
+            'https://trainy.mintlify.app', # pylint: disable=line-too-long
             dim=True,
         )
     )
@@ -179,14 +180,5 @@ def check(
     return enabled_clouds
 
 
-# === Helper functions ===
-def storage_in_iterable(
-    cloud: 'cloud_stores.GcsCloudStorage',
-    cloud_list: Iterable['cloud_stores.GcsCloudStorage'],
-) -> bool:
-    """Returns whether the cloud is in the given cloud list."""
-    return any(cloud == c for c in cloud_list)
-
-
 def _format_enabled_storage(cloud_name: str) -> str:
     return f'{colorama.Fore.GREEN}{cloud_name}{colorama.Style.RESET_ALL}'
konduktor/data/constants.py
CHANGED
@@ -4,9 +4,4 @@ Path = str
 SourceType = Union[Path]
 StorageHandle = Any
 
-# TODO(asaiacai) This should match the cloud store
-# classes in cloud_stores.py,
-# should honestly just use one or the other instead of both
-STORE_ENABLED_CLOUDS = ['gs']
-
 _STORAGE_LOG_FILE_NAME = 'storage.log'
konduktor/data/gcp/__init__.py
CHANGED
@@ -1,18 +1,18 @@
 """Data sync between workstation <--> blob (s3, gcs, etc.) <--> worker pods"""
 
 from konduktor.data.gcp.constants import (
-    DEFAULT_SERVICE_ACCOUNT_ROLES,
     STORAGE_MINIMAL_PERMISSIONS,
 )
 from konduktor.data.gcp.gcs import (
     DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH,
     GOOGLE_SDK_INSTALLATION_COMMAND,
+    GcsCloudStorage,
     GcsStore,
 )
 
 __all__ = [
     'GcsStore',
-    '
+    'GcsCloudStorage',
     'STORAGE_MINIMAL_PERMISSIONS',
     'GOOGLE_SDK_INSTALLATION_COMMAND',
     'DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH',
konduktor/data/gcp/gcs.py
CHANGED
@@ -578,6 +578,10 @@ class GcsStore(storage_utils.AbstractStore):
             bucket = self.client.bucket(bucket_name)
             bucket.storage_class = 'STANDARD'
             new_bucket = self.client.create_bucket(bucket, location=region)
+        except gcp.conflict_exception():
+            # it's fine to pass this exception since
+            # this means the bucket already exists
+            pass
         except Exception as e: # pylint: disable=broad-except
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.StorageBucketCreateError(
@@ -904,3 +908,79 @@ class GcsStore(storage_utils.AbstractStore):
     @classmethod
     def get_k8s_credential_name(cls) -> str:
         return cls._GCP_SECRET_NAME
+
+
+class GcsCloudStorage(storage_utils.CloudStorage):
+    """Google Cloud Storage."""
+
+    # We use gsutil as a basic implementation. One pro is that its -m
+    # multi-threaded download is nice, which frees us from implementing
+    # parellel workers on our end.
+    # The gsutil command is part of the Google Cloud SDK, and we reuse
+    # the installation logic here.
+    _INSTALL_GSUTIL = GOOGLE_SDK_INSTALLATION_COMMAND
+    _STORE: typing.Type[storage_utils.AbstractStore] = GcsStore
+
+    @property
+    def _gsutil_command(self):
+        gsutil_alias, alias_gen = data_utils.get_gsutil_command()
+        return (
+            f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
+            f'{DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; '
+            # Explicitly activate service account. Unlike the gcp packages
+            # and other GCP commands, gsutil does not automatically pick up
+            # the default credential keys when it is a service account.
+            'gcloud auth activate-service-account '
+            '--key-file=$GOOGLE_APPLICATION_CREDENTIALS '
+            '2> /dev/null || true; '
+            f'{gsutil_alias}'
+        )
+
+    def is_directory(self, url: str) -> bool:
+        """Returns whether 'url' is a directory.
+        In cloud object stores, a "directory" refers to a regular object whose
+        name is a prefix of other objects.
+        """
+        commands = [self._INSTALL_GSUTIL]
+        commands.append(f'{self._gsutil_command} ls -d {url}')
+        command = ' && '.join(commands)
+        p = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            shell=True,
+            check=True,
+            executable='/bin/bash',
+        )
+        out = p.stdout.decode().strip()
+        # Edge Case: Gcloud command is run for first time #437
+        out = out.split('\n')[-1]
+        # If <url> is a bucket root, then we only need `gsutil` to succeed
+        # to make sure the bucket exists. It is already a directory.
+        _, key = data_utils.split_gcs_path(url)
+        if not key:
+            return True
+        # Otherwise, gsutil ls -d url will return:
+        # --> url.rstrip('/') if url is not a directory
+        # --> url with an ending '/' if url is a directory
+        if not out.endswith('/'):
+            assert out == url.rstrip('/'), (out, url)
+            return False
+        url = url if url.endswith('/') else (url + '/')
+        assert out == url, (out, url)
+        return True
+
+    def make_sync_dir_command(self, source: str, destination: str) -> str:
+        """Downloads a directory using gsutil."""
+        download_via_gsutil = (
+            f'{self._gsutil_command} ' f'rsync -e -r {source} {destination}'
+        )
+        all_commands = [self._INSTALL_GSUTIL]
+        all_commands.append(download_via_gsutil)
+        return ' && '.join(all_commands)
+
+    def make_sync_file_command(self, source: str, destination: str) -> str:
+        """Downloads a file using gsutil."""
+        download_via_gsutil = f'{self._gsutil_command} ' f'cp {source} {destination}'
+        all_commands = [self._INSTALL_GSUTIL]
+        all_commands.append(download_via_gsutil)
+        return ' && '.join(all_commands)
konduktor/data/registry.py
ADDED
@@ -0,0 +1,18 @@
+from konduktor.data.gcp import GcsCloudStorage
+
+# Maps bucket's URIs prefix(scheme) to its corresponding storage class
+
+_REGISTRY = {
+    'gs': GcsCloudStorage(),
+    # TODO(asaiacai): Add other cloud stores here
+    # 's3': S3CloudStorage(),
+    # 'r2': R2CloudStorage(),
+    # 'cos': IBMCosCloudStorage(),
+    # 'oci': OciCloudStorage(),
+    # # TODO: This is a hack, as Azure URL starts with https://, we should
+    # # refactor the registry to be able to take regex, so that Azure blob can
+    # # be identified with `https://(.*?)\.blob\.core\.windows\.net`
+    # 'https': AzureBlobCloudStorage()
+}
+
+_STORE_ENABLED_CLOUDS = list(_REGISTRY.keys())
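This new module replaces both the old cloud_stores._REGISTRY and data.constants.STORE_ENABLED_CLOUDS, and is what check.py, jobset_utils.py, storage.py, execution.py, and schemas.py now import. A short sketch of the two ways the rest of this diff consumes it:

```python
from konduktor.data import registry

# 1. URI scheme -> cloud store lookup (check.py, jobset_utils.py, schemas.py).
gcs_store = registry._REGISTRY.get('gs')   # GcsCloudStorage instance, or None

# 2. Which clouds may hold file mounts (storage.py, execution.py).
print(registry._STORE_ENABLED_CLOUDS)     # ['gs'] in this release
```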
konduktor/data/storage.py
CHANGED
@@ -28,7 +28,7 @@ import urllib.parse
 from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
 
 from konduktor import check, config, logging
-from konduktor.data import constants, data_utils, gcp, storage_utils
+from konduktor.data import constants, data_utils, gcp, registry, storage_utils
 from konduktor.utils import annotations, common_utils, exceptions, schemas, ux_utils
 
 logger = logging.get_logger(__file__)
@@ -40,13 +40,13 @@ def get_cached_enabled_storage_clouds_or_refresh(
 ) -> List[str]:
     # This is a temporary solution until https://github.com/skypilot-org/skypilot/issues/1943 # noqa: E501
     # (asaiacai): This function does not do any actual checking right now.
-    # this is temporary.In the future, we can cache to disk.
+    # this is temporary. In the future, we can cache to disk.
     # For now, we just print a warning to the user saying what
     # clouds are enabled and if the task fails to run `konduktor check`
     # to update the credentials.
     enabled_clouds = config.get_nested(('allowed_clouds',), [])
     if len(enabled_clouds) == 0:
-        enabled_clouds =
+        enabled_clouds = registry._STORE_ENABLED_CLOUDS
     else:
         enabled_clouds = [str(cloud) for cloud in enabled_clouds]
         logger.warning(
konduktor/data/storage_utils.py
CHANGED
@@ -498,3 +498,38 @@ class AbstractStore:
     def get_k8s_credential_name(cls) -> str:
         """Returns the name of the k8s secret storing the credentials for the store."""
         raise NotImplementedError
+
+
+class CloudStorage:
+    """Interface for a cloud object store."""
+
+    # this needs to be overridden by the subclass
+    _STORE: typing.Type[AbstractStore]
+
+    def is_directory(self, url: str) -> bool:
+        """Returns whether 'url' is a directory.
+
+        In cloud object stores, a "directory" refers to a regular object whose
+        name is a prefix of other objects.
+        """
+        raise NotImplementedError
+
+    def make_sync_dir_command(self, source: str, destination: str) -> str:
+        """Makes a runnable bash command to sync a 'directory'."""
+        raise NotImplementedError
+
+    def make_sync_file_command(self, source: str, destination: str) -> str:
+        """Makes a runnable bash command to sync a file."""
+        raise NotImplementedError
+
+    def check_credentials(self):
+        """Checks if the user has access credentials to this cloud."""
+        return self._STORE.check_credentials()
+
+    def check_credentials_from_secret(self):
+        """Checks if the user has access credentials to this cloud."""
+        return self._STORE.check_credentials_from_secret()
+
+    def set_secret_credentials(self):
+        """Set the credentials from the secret"""
+        return self._STORE.set_secret_credentials()
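CloudStorage, also moved here from cloud_stores.py, is the interface a future backend would implement before being registered in registry._REGISTRY. A hypothetical stub for the commented-out 's3' entry; S3CloudStorage and an S3Store do not exist in this wheel, and the aws CLI commands are only illustrative:

```python
from konduktor.data import storage_utils


class S3CloudStorage(storage_utils.CloudStorage):
    """Hypothetical S3 backend sketch; not part of this release."""

    # A real implementation would point _STORE at an AbstractStore subclass,
    # e.g. _STORE = S3Store, so that credential checks delegate correctly.

    def is_directory(self, url: str) -> bool:
        # e.g. list the prefix and see whether multiple keys come back
        raise NotImplementedError

    def make_sync_dir_command(self, source: str, destination: str) -> str:
        return f'aws s3 sync {source} {destination}'

    def make_sync_file_command(self, source: str, destination: str) -> str:
        return f'aws s3 cp {source} {destination}'
```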
konduktor/execution.py
CHANGED
@@ -13,8 +13,8 @@ if typing.TYPE_CHECKING:
 from konduktor import config, constants
 from konduktor import logging as konduktor_logging
 from konduktor.backends import JobsetBackend
-from konduktor.data import constants as storage_constants
 from konduktor.data import data_utils
+from konduktor.data import registry as storage_registry
 from konduktor.data import storage as storage_lib
 from konduktor.utils import common_utils, exceptions, rich_utils, ux_utils
 
@@ -345,7 +345,7 @@ def maybe_translate_local_file_mounts_and_sync_up(
             data_src += ' and workdir'
         else:
             data_src = 'workdir'
-        store_enabled_clouds = ', '.join(
+        store_enabled_clouds = ', '.join(storage_registry._STORE_ENABLED_CLOUDS)
         with ux_utils.print_exception_no_traceback():
             raise exceptions.NotSupportedError(
                 f'Unable to use {data_src} - no cloud with object '
konduktor/kube_client.py
CHANGED
@@ -18,7 +18,7 @@ DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
 # for Kubernetes clusters. This is used to associate a name with the current
 # context when running with in-cluster auth. If not set, the context name is
 # set to DEFAULT_IN_CLUSTER_REGION.
-IN_CLUSTER_CONTEXT_NAME_ENV_VAR = '
+IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'KONDUKTOR_IN_CLUSTER_CONTEXT_NAME'
 
 
 def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
konduktor/templates/pod.yaml.j2
CHANGED
@@ -85,7 +85,7 @@ kubernetes:
             PACKAGES="$PACKAGES git";
           {% endif %}
 
-          if [ -z "${PACKAGES}" ]; then
+          if [ ! -z "${PACKAGES}" ]; then
             # Run apt update, install missing packages
             DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > ~/.konduktor/tmp/apt-update.log 2>&1 || \
             $(prefix_cmd) echo "Warning: apt-get update failed. Continuing anyway..." >> ~/.konduktor/tmp/apt-update.log
konduktor/utils/schemas.py
CHANGED
@@ -454,7 +454,8 @@ _REMOTE_IDENTITY_SCHEMA_KUBERNETES = {
 
 def get_storage_schema():
     # pylint: disable=import-outside-toplevel
-    from
+    from knoduktor.registry import registry
+
     from konduktor.data import storage
 
     return {
@@ -476,7 +477,7 @@ def get_storage_schema():
             },
             'store': {
                 'type': 'string',
-                'case_insensitive_enum': [type for type in
+                'case_insensitive_enum': [type for type in registry._REGISTRY],
             },
             'persistent': {
                 'type': 'boolean',
@@ -497,7 +498,7 @@ def get_storage_schema():
 
 def get_config_schema():
     # pylint: disable=import-outside-toplevel
-    from konduktor import
+    from konduktor.data import registry
     from konduktor.utils import kubernetes_enums
 
     cloud_configs = {
@@ -547,7 +548,7 @@ def get_config_schema():
             'required': ['items'],
             'items': {
                 'type': 'string',
-                'case_insensitive_enum': (list(
+                'case_insensitive_enum': (list(registry._REGISTRY.keys())),
             },
         }
 
{konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: konduktor-nightly
-Version: 0.1.0.
+Version: 0.1.0.dev20250328104606
 Summary: GPU Cluster Health Management
 Author: Andrew Aikawa
 Author-email: asai@berkeley.edu
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: colorama (>=0.4.6,<0.5.0)
+Requires-Dist: filelock (>=3.18.0,<4.0.0)
 Requires-Dist: google-api-python-client[gcp] (>=2.161.0,<3.0.0)
 Requires-Dist: google-cloud-storage[gcp] (>=3.0.0,<4.0.0)
 Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
{konduktor_nightly-0.1.0.dev20250327104656.dist-info → konduktor_nightly-0.1.0.dev20250328104606.dist-info}/RECORD
CHANGED
@@ -1,14 +1,13 @@
-konduktor/__init__.py,sha256=
+konduktor/__init__.py,sha256=2U2pj_Uu1x76859aRUOI6UapnthXIO_rYCUUhV4K8oQ,1540
 konduktor/adaptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/adaptors/common.py,sha256=mYb_6c3u5MghtiFfiW5OO-EH6t7cIR5npbkgUmz6FYE,3517
-konduktor/adaptors/gcp.py,sha256=
+konduktor/adaptors/gcp.py,sha256=pOQA2q8fFyr97Htn8EqvNM0XT-Ao8UwvExviiLaDats,4746
 konduktor/backends/__init__.py,sha256=1Q6sqqdeMYarpTX_U-QVywJYf7idiUTRsyP-E4BQSOw,129
 konduktor/backends/backend.py,sha256=qh0bp94lzoTYZkzyQv2-CVrB5l91FkG2vclXg24UFC0,2910
 konduktor/backends/jobset.py,sha256=lh_PihQgM0tmVryCpjSsZjWug8hBnJr7ua9lqk0qEAM,8251
-konduktor/backends/jobset_utils.py,sha256=
-konduktor/check.py,sha256=
+konduktor/backends/jobset_utils.py,sha256=5xeFPNN724SX6M4p4Koump5yCcdRJdiSUs03TexSwMQ,17236
+konduktor/check.py,sha256=JennyWoaqSKhdyfUldd266KwVXTPJpcYQa4EED4a_BA,7569
 konduktor/cli.py,sha256=90bnh3nIobfBkzqS_SXgw9Z8Zqh4ouwpLDj0kx_6kL8,23562
-konduktor/cloud_stores.py,sha256=KX3u5YlXGslMCe_q8zYtFy62_KGCmmLTrYuK7Y9jFIM,6277
 konduktor/config.py,sha256=J50JxC6MsXMnlrJPXdDUMr38C89xvOO7mR8KJ6fyils,15520
 konduktor/constants.py,sha256=T3AeXXxuQHINW_bAWyztvDeS8r4g8kXBGIwIq13cys0,1814
 konduktor/controller/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -49,16 +48,17 @@ konduktor/dashboard/frontend/postcss.config.mjs,sha256=rDHiqV72T-J860Ek4QFnUnMQe
 konduktor/dashboard/frontend/server.js,sha256=jcp6_Ww9YJD3uKY07jR3KMlAM6n1QZdxZnVY6Kh-J6A,1789
 konduktor/dashboard/frontend/tailwind.config.js,sha256=fCnc48wvioIDOe5ldQ_6RE7F76cP7aU7pDrxBPJx-Fk,366
 konduktor/data/__init__.py,sha256=KMR2i3E9YcIpiIuCxtRdS7BQ1w2vUAbbve7agziJrLo,213
-konduktor/data/constants.py,sha256=
+konduktor/data/constants.py,sha256=yXVEoTI2we1xOjVSU-bjRCQCLpVvpEvJ0GedXvSwEfw,127
 konduktor/data/data_utils.py,sha256=aIv3q2H1GSiN2w8WNjZgVaglm-hoiHSb4KR-MAiKKXs,8383
-konduktor/data/gcp/__init__.py,sha256=
+konduktor/data/gcp/__init__.py,sha256=rlQxACBC_Vu36mdgPyJgUy4mGc_6Nt_a96JAuaPz2pQ,489
 konduktor/data/gcp/constants.py,sha256=dMfOiFccM8O6rUi9kClJcbvw1K1VnS1JzzQk3apq8ho,1483
-konduktor/data/gcp/gcs.py,sha256=
+konduktor/data/gcp/gcs.py,sha256=kDbUzf8ALYzsw_G3sBRn_enQ8fjI-UKV0jeWuFZiULA,42018
 konduktor/data/gcp/utils.py,sha256=FJQcMXZqtMIzjZ98b3lTTc0UbdPUKTDLsOsfJaaH5-s,214
-konduktor/data/
-konduktor/data/
-konduktor/
-konduktor/
+konduktor/data/registry.py,sha256=eLs8Wr5ugwOfXGPtg1utTvGIqdbVLsCf-a3PFS1NELc,652
+konduktor/data/storage.py,sha256=sm0ZfGZUZRiChza_jMRQY1xDIWtZpFQqwPuVOF8PM_Y,34742
+konduktor/data/storage_utils.py,sha256=n4GivkN0KMqmyOTDznF0Z-hzsJvm7KCEh5i5HgFAT-4,20806
+konduktor/execution.py,sha256=UaHUdBmDaIYgiAXkRKJQOHniYPVIR4sr4yUbIqpgMrQ,18401
+konduktor/kube_client.py,sha256=aqwjDfNSneB5NOxV6CtqhkBeNl0UQNUt730R3ujG9Ow,6156
 konduktor/logging.py,sha256=mBCoCTNhDEkUxd4tsse4mw-aVzSGohhXYf16ViR0ch4,2722
 konduktor/manifests/controller_deployment.yaml,sha256=6p3oSLkEVONZsvKZGqVop0Dhn4bo3lrigRmhf8NXBHE,1730
 konduktor/manifests/dashboard_deployment.yaml,sha256=xJLd4FbPMAosI0fIv5_8y7dV9bw0Vsf81l-w4MB_aU8,2837
@@ -67,7 +67,7 @@ konduktor/manifests/pod_cleanup_controller.yaml,sha256=hziL1Ka1kCAEL9R7Tjvpb80iw
 konduktor/resource.py,sha256=68z8gC8Ivqktwv0R6ylMn9ZNocgkcRT0yIRGGKOdwcM,18491
 konduktor/task.py,sha256=edHgMLYECGux6WLCilqsNZNYr3dEcw_miWvu4FYpu5U,34713
 konduktor/templates/jobset.yaml.j2,sha256=onYiHtXAgk-XBtji994hPu_g0hxnLzvmfxwjbdKdeZc,960
-konduktor/templates/pod.yaml.j2,sha256=
+konduktor/templates/pod.yaml.j2,sha256=rO5rDfM2XYCRbc5tD0bXzIK_ulZGlVISZdQdJzr-Gfk,8435
 konduktor/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 konduktor/usage/constants.py,sha256=gCL8afIHZhO0dcxbJGpESE9sCC1cBSbeRnQ8GwNOY4M,612
 konduktor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -83,12 +83,12 @@ konduktor/utils/kubernetes_utils.py,sha256=NGBredKPWpZC8VNlwTfWLhHnc-p68d5xlxT-0
 konduktor/utils/log_utils.py,sha256=lgHCq4OdtJNfbpso-uYGONUCVNsUrUkUWjROarsHt6s,9897
 konduktor/utils/loki_utils.py,sha256=ND1pbbbFhLhLKw3870j44LpR_9MB0EkDJSs5K7nWdY4,3473
 konduktor/utils/rich_utils.py,sha256=kdjNe6S2LlpOxyzhFHqMzCz7g4ROC4e7TPWgcbRsrQE,3577
-konduktor/utils/schemas.py,sha256=
+konduktor/utils/schemas.py,sha256=Gv7SEhFpv-eO5izqRz8d-eQ9z-lVmY05akm6HEXIIdc,17478
 konduktor/utils/subprocess_utils.py,sha256=WoFkoFhGecPR8-rF8WJxbIe-YtV94LXz9UG64SDhCY4,9448
 konduktor/utils/ux_utils.py,sha256=NPNu3Igu2Z9Oq77ghJhy_fIxQZTXWr9BtKyxN3Wslzo,7164
 konduktor/utils/validator.py,sha256=tgBghVyedyzGx84-U2Qfoh_cJBE3oUk9gclMW90ORks,691
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
-konduktor_nightly-0.1.0.
+konduktor_nightly-0.1.0.dev20250328104606.dist-info/LICENSE,sha256=MuuqTZbHvmqXR_aNKAXzggdV45ANd3wQ5YI7tnpZhm0,6586
+konduktor_nightly-0.1.0.dev20250328104606.dist-info/METADATA,sha256=AEritN7-smRecfPlFGJTxTtmqRImheYS4fW-KsGCBBo,4112
+konduktor_nightly-0.1.0.dev20250328104606.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+konduktor_nightly-0.1.0.dev20250328104606.dist-info/entry_points.txt,sha256=k3nG5wDFIJhNqsZWrHk4d0irIB2Ns9s47cjRWYsTCT8,48
+konduktor_nightly-0.1.0.dev20250328104606.dist-info/RECORD,,
konduktor/cloud_stores.py
DELETED
@@ -1,158 +0,0 @@
-# Proprietary Changes made for Trainy under the Trainy Software License
-# Original source: skypilot: https://github.com/skypilot-org/skypilot
-# which is Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Cloud object stores.
-
-Currently, used for transferring data in bulk. Thus, this module does not
-offer file-level calls (e.g., open, reading, writing).
-
-TODO:
-* Better interface.
-* Better implementation (e.g., fsspec, smart_open, using each cloud's SDK).
-"""
-
-import subprocess
-import typing
-
-from konduktor import logging
-from konduktor.data import data_utils, gcp, storage_utils
-
-logger = logging.get_logger(__name__)
-
-# TODO(asaiacai): this internal API is shit and should just be unified with
-# the storage_utils.AbstractStore class. Shit Berkeley EECS as usual.
-
-
-class CloudStorage:
-    """Interface for a cloud object store."""
-
-    # this needs to be overridden by the subclass
-    _STORE: typing.Type[storage_utils.AbstractStore]
-
-    def is_directory(self, url: str) -> bool:
-        """Returns whether 'url' is a directory.
-
-        In cloud object stores, a "directory" refers to a regular object whose
-        name is a prefix of other objects.
-        """
-        raise NotImplementedError
-
-    def make_sync_dir_command(self, source: str, destination: str) -> str:
-        """Makes a runnable bash command to sync a 'directory'."""
-        raise NotImplementedError
-
-    def make_sync_file_command(self, source: str, destination: str) -> str:
-        """Makes a runnable bash command to sync a file."""
-        raise NotImplementedError
-
-    def check_credentials(self):
-        """Checks if the user has access credentials to this cloud."""
-        return self._STORE.check_credentials()
-
-    def check_credentials_from_secret(self):
-        """Checks if the user has access credentials to this cloud."""
-        return self._STORE.check_credentials_from_secret()
-
-    def set_secret_credentials(self):
-        """Set the credentials from the secret"""
-        return self._STORE.set_secret_credentials()
-
-
-class GcsCloudStorage(CloudStorage):
-    """Google Cloud Storage."""
-
-    # We use gsutil as a basic implementation. One pro is that its -m
-    # multi-threaded download is nice, which frees us from implementing
-    # parellel workers on our end.
-    # The gsutil command is part of the Google Cloud SDK, and we reuse
-    # the installation logic here.
-    _INSTALL_GSUTIL = gcp.GOOGLE_SDK_INSTALLATION_COMMAND
-    _STORE: typing.Type[storage_utils.AbstractStore] = gcp.GcsStore
-
-    @property
-    def _gsutil_command(self):
-        gsutil_alias, alias_gen = data_utils.get_gsutil_command()
-        return (
-            f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS='
-            f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; '
-            # Explicitly activate service account. Unlike the gcp packages
-            # and other GCP commands, gsutil does not automatically pick up
-            # the default credential keys when it is a service account.
-            'gcloud auth activate-service-account '
-            '--key-file=$GOOGLE_APPLICATION_CREDENTIALS '
-            '2> /dev/null || true; '
-            f'{gsutil_alias}'
-        )
-
-    def is_directory(self, url: str) -> bool:
-        """Returns whether 'url' is a directory.
-        In cloud object stores, a "directory" refers to a regular object whose
-        name is a prefix of other objects.
-        """
-        commands = [self._INSTALL_GSUTIL]
-        commands.append(f'{self._gsutil_command} ls -d {url}')
-        command = ' && '.join(commands)
-        p = subprocess.run(
-            command,
-            stdout=subprocess.PIPE,
-            shell=True,
-            check=True,
-            executable='/bin/bash',
-        )
-        out = p.stdout.decode().strip()
-        # Edge Case: Gcloud command is run for first time #437
-        out = out.split('\n')[-1]
-        # If <url> is a bucket root, then we only need `gsutil` to succeed
-        # to make sure the bucket exists. It is already a directory.
-        _, key = data_utils.split_gcs_path(url)
-        if not key:
-            return True
-        # Otherwise, gsutil ls -d url will return:
-        # --> url.rstrip('/') if url is not a directory
-        # --> url with an ending '/' if url is a directory
-        if not out.endswith('/'):
-            assert out == url.rstrip('/'), (out, url)
-            return False
-        url = url if url.endswith('/') else (url + '/')
-        assert out == url, (out, url)
-        return True
-
-    def make_sync_dir_command(self, source: str, destination: str) -> str:
-        """Downloads a directory using gsutil."""
-        download_via_gsutil = (
-            f'{self._gsutil_command} ' f'rsync -e -r {source} {destination}'
-        )
-        all_commands = [self._INSTALL_GSUTIL]
-        all_commands.append(download_via_gsutil)
-        return ' && '.join(all_commands)
-
-    def make_sync_file_command(self, source: str, destination: str) -> str:
-        """Downloads a file using gsutil."""
-        download_via_gsutil = f'{self._gsutil_command} ' f'cp {source} {destination}'
-        all_commands = [self._INSTALL_GSUTIL]
-        all_commands.append(download_via_gsutil)
-        return ' && '.join(all_commands)
-
-
-# Maps bucket's URIs prefix(scheme) to its corresponding storage class
-_REGISTRY = {
-    'gs': GcsCloudStorage(),
-    # TODO(asaiacai): Add other cloud stores here
-    # 's3': S3CloudStorage(),
-    # 'r2': R2CloudStorage(),
-    # 'cos': IBMCosCloudStorage(),
-    # 'oci': OciCloudStorage(),
-    # # TODO: This is a hack, as Azure URL starts with https://, we should
-    # # refactor the registry to be able to take regex, so that Azure blob can
-    # # be identified with `https://(.*?)\.blob\.core\.windows\.net`
-    # 'https': AzureBlobCloudStorage()
-}
File without changes
File without changes