konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,223 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
import os
|
14
|
+
import urllib.parse
|
15
|
+
from multiprocessing import pool
|
16
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
17
|
+
|
18
|
+
from konduktor import logging
|
19
|
+
from konduktor.adaptors import gcp
|
20
|
+
from konduktor.utils import exceptions, log_utils, ux_utils
|
21
|
+
|
22
|
+
Client = Any
|
23
|
+
|
24
|
+
logger = logging.get_logger(__name__)
|
25
|
+
|
26
|
+
|
27
|
+
def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Splits GCS Path into Bucket name and Relative Path to Bucket

    Args:
        gcs_path: str; GCS Path, e.g. gcs://imagenet/train/

    Returns:
        Tuple of (bucket name, key). The key is '' when the path names
        only a bucket (e.g. gs://imagenet).
    """
    # Strip only the leading scheme. The previous str.replace('gs://', '')
    # removed the substring anywhere in the path, which would corrupt an
    # object key that happened to contain 'gs://'.
    if gcs_path.startswith('gs://'):
        gcs_path = gcs_path[len('gs://'):]
    bucket, _, key = gcs_path.partition('/')
    return bucket, key
|
37
|
+
|
38
|
+
|
39
|
+
def verify_gcs_bucket(name: str) -> bool:
    """Return True iff the GCS bucket exists.

    Args:
        name: str; Name of GCS Bucket (without gs:// prefix)
    """
    try:
        gcp.storage_client().get_bucket(name)
    except gcp.not_found_exception():
        # The lookup raised NotFound: no such bucket.
        return False
    return True
|
50
|
+
|
51
|
+
|
52
|
+
def is_cloud_store_url(url):
    """Return the URL's network location, truthy only for cloud URLs.

    Local/relative paths have no netloc, so urlsplit yields '' (falsy)
    for them.
    """
    return urllib.parse.urlsplit(url).netloc
|
56
|
+
|
57
|
+
|
58
|
+
def _group_files_by_dir(
|
59
|
+
source_list: List[str],
|
60
|
+
) -> Tuple[Dict[str, List[str]], List[str]]:
|
61
|
+
"""Groups a list of paths based on their directory
|
62
|
+
|
63
|
+
Given a list of paths, generates a dict of {dir_name: List[file_name]}
|
64
|
+
which groups files with same dir, and a list of dirs in the source_list.
|
65
|
+
|
66
|
+
This is used to optimize uploads by reducing the number of calls to rsync.
|
67
|
+
E.g., ['a/b/c.txt', 'a/b/d.txt', 'a/e.txt'] will be grouped into
|
68
|
+
{'a/b': ['c.txt', 'd.txt'], 'a': ['e.txt']}, and these three files can be
|
69
|
+
uploaded in two rsync calls instead of three.
|
70
|
+
|
71
|
+
Args:
|
72
|
+
source_list: List[str]; List of paths to group
|
73
|
+
"""
|
74
|
+
grouped_files: Dict[str, List[str]] = {}
|
75
|
+
dirs = []
|
76
|
+
for source in source_list:
|
77
|
+
source = os.path.abspath(os.path.expanduser(source))
|
78
|
+
if os.path.isdir(source):
|
79
|
+
dirs.append(source)
|
80
|
+
else:
|
81
|
+
base_path = os.path.dirname(source)
|
82
|
+
file_name = os.path.basename(source)
|
83
|
+
if base_path not in grouped_files:
|
84
|
+
grouped_files[base_path] = []
|
85
|
+
grouped_files[base_path].append(file_name)
|
86
|
+
return grouped_files, dirs
|
87
|
+
|
88
|
+
|
89
|
+
def parallel_upload(
    source_path_list: List[str],
    filesync_command_generator: Callable[[str, List[str]], str],
    dirsync_command_generator: Callable[[str, str], str],
    log_path: str,
    bucket_name: str,
    access_denied_message: str,
    create_dirs: bool = False,
    max_concurrent_uploads: Optional[int] = None,
) -> None:
    """Helper function to run parallel uploads for a list of paths.

    Used by Store to run rsync commands in parallel by
    providing appropriate command generators.

    Args:
        source_path_list: List of paths to local files or directories
        filesync_command_generator: Callable that generates rsync command
            for a list of files belonging to the same dir.
        dirsync_command_generator: Callable that generates rsync command
            for a directory.
        log_path: Path to the log file
        bucket_name: Name of the bucket
        access_denied_message: Message to intercept from the underlying
            upload utility when permissions are insufficient. Used in
            exception handling.
        create_dirs: If the local_path is a directory and this is set to
            False, the contents of the directory are directly uploaded to
            root of the bucket. If the local_path is a directory and this is
            set to True, the directory is created in the bucket root and
            contents are uploaded to it.
        max_concurrent_uploads: Maximum number of concurrent threads to use
            to upload files.
    """
    grouped_files, dirs = _group_files_by_dir(source_path_list)
    # One sync command per group of files sharing a directory.
    commands = [
        filesync_command_generator(dir_path, file_names)
        for dir_path, file_names in grouped_files.items()
    ]
    # One sync command per directory.
    for dir_path in dirs:
        dest_dir_name = os.path.basename(dir_path) if create_dirs else ''
        commands.append(dirsync_command_generator(dir_path, dest_dir_name))

    # Fan the upload commands out over a thread pool.
    with pool.ThreadPool(processes=max_concurrent_uploads) as p:
        p.starmap(
            run_upload_cli,
            [(cmd, access_denied_message, bucket_name, log_path)
             for cmd in commands],
        )
|
150
|
+
|
151
|
+
|
152
|
+
def get_gsutil_command() -> Tuple[str, str]:
    """Gets the alias'd command for gsutil and a command to define the alias.

    This is required for applying platform-specific flags to gsutil.

    In particular, we disable multiprocessing on Mac using
    `-o "GSUtil:parallel_process_count=1"`. Multithreading is still enabled.
    gsutil on Mac has a bug with multiprocessing that causes it to crash
    when uploading files. Related issues:
    https://bugs.python.org/issue33725
    https://github.com/GoogleCloudPlatform/gsutil/issues/464

    The flags are added by checking the platform using bash in a one-liner.
    The platform check is done inline to have the flags match where the command
    is executed, rather than where the code is run. This is important when
    the command is run in a remote VM.

    Returns:
        Tuple[str, str] : (gsutil_alias, command to generate the alias)
        The command to generate alias must be run before using the alias. E.g.,
        ```
        gsutil_alias, alias_gen = get_gsutil_command()
        cmd_to_run = f'{alias_gen}; {gsutil_alias} cp ...'
        ```
    """
    alias_name = 'skypilot_gsutil'
    mac_flag = '-o "GSUtil:parallel_process_count=1"'

    # skypilot_gsutil is declared as a shell *function* rather than an
    # alias: functions behave just like aliases but can be invoked on the
    # same command line where they are defined.
    darwin_def = f'{alias_name}() {{ gsutil -m {mac_flag} "$@"; }}'
    default_def = f'{alias_name}() {{ gsutil -m "$@"; }}'
    define_cmd = (
        f'[[ "$(uname)" == "Darwin" ]] && {darwin_def} || {default_def}'
    )

    return alias_name, define_cmd
|
190
|
+
|
191
|
+
|
192
|
+
def run_upload_cli(
    command: str, access_denied_message: str, bucket_name: str, log_path: str
):
    """Run one upload command, surfacing permission and upload failures.

    Args:
        command: Shell command to execute.
        access_denied_message: Substring of stderr that indicates the
            bucket rejected the write for permission reasons.
        bucket_name: Name of the destination bucket (used in errors).
        log_path: Path to the log file for this command.

    Raises:
        PermissionError: stderr contained access_denied_message.
        exceptions.StorageUploadError: the command exited non-zero.
    """
    # bash (not sh) is required: some cloud commands use bash-only syntax
    # such as [[ ... ]].
    rc, out, err = log_utils.run_with_log(  # type: ignore[misc]
        command,
        log_path,
        shell=True,
        require_outputs=True,
        executable='/bin/bash',
    )
    if access_denied_message in err:
        with ux_utils.print_exception_no_traceback():
            raise PermissionError(
                'Failed to upload files to '
                'the remote bucket. The bucket does not have '
                'write permissions. It is possible that '
                'the bucket is public.'
            )
    if rc != 0:
        with ux_utils.print_exception_no_traceback():
            logger.error(err)
            raise exceptions.StorageUploadError(
                f'Upload to bucket failed for store {bucket_name}. '
                f'Please check the logs: {log_path}'
            )
    if not out:
        # Empty stdout: nothing was transferred (everything already in
        # sync, or the tool reported the problem only via stderr).
        logger.debug(
            'No file uploaded. This could be due to an error or '
            'because all files already exist on the cloud.'
        )
|
@@ -0,0 +1,19 @@
|
|
1
|
+
"""Data sync between workstation <--> blob (s3, gcs, etc.) <--> worker pods"""
|
2
|
+
|
3
|
+
from konduktor.data.gcp.constants import (
|
4
|
+
DEFAULT_SERVICE_ACCOUNT_ROLES,
|
5
|
+
STORAGE_MINIMAL_PERMISSIONS,
|
6
|
+
)
|
7
|
+
from konduktor.data.gcp.gcs import (
|
8
|
+
DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH,
|
9
|
+
GOOGLE_SDK_INSTALLATION_COMMAND,
|
10
|
+
GcsStore,
|
11
|
+
)
|
12
|
+
|
13
|
+
__all__ = [
|
14
|
+
'GcsStore',
|
15
|
+
'DEFAULT_SERVICE_ACCOUNT_ROLES',
|
16
|
+
'STORAGE_MINIMAL_PERMISSIONS',
|
17
|
+
'GOOGLE_SDK_INSTALLATION_COMMAND',
|
18
|
+
'DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH',
|
19
|
+
]
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Version tag folded into resource names so that breaking changes to the
# service-account setup can be rolled out under a new identity.
VERSION = 'v1'

KONDUKTOR = 'KONDUKTOR'
# NOTE(review): GCP service-account IDs must be 6-30 chars of lowercase
# letters, digits, and hyphens; an uppercase 'KONDUKTOR-v1' would be rejected
# if passed verbatim as an accountId -- confirm how consumers use this value.
KONDUKTOR_SERVICE_ACCOUNT_ID = KONDUKTOR + '-' + VERSION
# Template for the fully-qualified service-account email; formatted with
# `account_id` and `project_id`.
KONDUKTOR_SERVICE_ACCOUNT_EMAIL_TEMPLATE = (
    '{account_id}@{project_id}.iam.gserviceaccount.com'
)
# Metadata supplied when the service account is created.
KONDUKTOR_SERVICE_ACCOUNT_CONFIG = {
    'displayName': f'KONDUKTOR Service Account ({VERSION})',
}

# These roles will always be added.
# NOTE: `serviceAccountUser` allows the head node to create workers with
# a serviceAccount. `roleViewer` allows the head node to run bootstrap_gcp.
DEFAULT_SERVICE_ACCOUNT_ROLES = [
    'roles/storage.admin',
    'roles/iam.serviceAccountUser',
    'roles/iam.roleViewer',
]

# A list of permissions required to run Konduktor on GCP.
# Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # noqa: E501
STORAGE_MINIMAL_PERMISSIONS = [
    'iam.roles.get',
    # We now skip the check for `iam.serviceAccounts.actAs` permission for
    # simplicity as it can be granted at the service-account level.
    # Check: sky.provision.gcp.config::_is_permission_satisfied
    # 'iam.serviceAccounts.actAs',
    'iam.serviceAccounts.get',
    'serviceusage.services.enable',
    'serviceusage.services.list',
    'serviceusage.services.use',
    'storage.buckets.create',
    'storage.buckets.get',
    'storage.buckets.delete',
    'storage.objects.create',
    'storage.objects.delete',
    'storage.objects.update',
    'storage.objects.get',
    'storage.objects.list',
    'resourcemanager.projects.get',
]
|