konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,223 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ import os
14
+ import urllib.parse
15
+ from multiprocessing import pool
16
+ from typing import Any, Callable, Dict, List, Optional, Tuple
17
+
18
+ from konduktor import logging
19
+ from konduktor.adaptors import gcp
20
+ from konduktor.utils import exceptions, log_utils, ux_utils
21
+
22
+ Client = Any
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Splits a GCS path into the bucket name and the key within the bucket.

    Args:
        gcs_path: str; GCS path, e.g. gs://imagenet/train/

    Returns:
        Tuple of (bucket name, key relative to the bucket root). The key is
        '' when the path names only a bucket.
    """
    # Strip the scheme, then cut at the first '/' separator.
    bucket, _, key = gcs_path.replace('gs://', '').partition('/')
    return bucket, key
37
+
38
+
39
def verify_gcs_bucket(name: str) -> bool:
    """Checks whether the named GCS bucket exists.

    Returns True when the bucket can be fetched, False when GCP reports it
    as not found. Other GCP errors (e.g. permission denied) propagate.

    Args:
        name: str; Name of GCS Bucket (without gs:// prefix)
    """
    try:
        gcp.storage_client().get_bucket(name)
    except gcp.not_found_exception():
        return False
    return True
50
+
51
+
52
def is_cloud_store_url(url):
    """Returns the URL's network location, which is falsy for non-cloud URLs.

    Local paths and relative paths have no netloc, so this doubles as a
    truthiness test for "is this a cloud store URL".
    """
    parsed = urllib.parse.urlsplit(url)
    return parsed.netloc
56
+
57
+
58
def _group_files_by_dir(
    source_list: List[str],
) -> Tuple[Dict[str, List[str]], List[str]]:
    """Groups a list of paths based on their directory

    Given a list of paths, generates a dict of {dir_name: List[file_name]}
    which groups files with same dir, and a list of dirs in the source_list.

    This is used to optimize uploads by reducing the number of calls to rsync.
    E.g., ['a/b/c.txt', 'a/b/d.txt', 'a/e.txt'] will be grouped into
    {'a/b': ['c.txt', 'd.txt'], 'a': ['e.txt']}, and these three files can be
    uploaded in two rsync calls instead of three.

    Args:
        source_list: List[str]; List of paths to group

    Returns:
        Tuple of (mapping of parent dir -> file basenames,
        list of directory paths). All paths are normalized to absolute
        paths with '~' expanded.
    """
    grouped_files: Dict[str, List[str]] = {}
    dirs: List[str] = []
    for source in source_list:
        # Normalize so '~' and relative spellings of the same dir group
        # together.
        source = os.path.abspath(os.path.expanduser(source))
        if os.path.isdir(source):
            dirs.append(source)
        else:
            base_path = os.path.dirname(source)
            file_name = os.path.basename(source)
            # setdefault replaces the check-then-insert dance.
            grouped_files.setdefault(base_path, []).append(file_name)
    return grouped_files, dirs
87
+
88
+
89
def parallel_upload(
    source_path_list: List[str],
    filesync_command_generator: Callable[[str, List[str]], str],
    dirsync_command_generator: Callable[[str, str], str],
    log_path: str,
    bucket_name: str,
    access_denied_message: str,
    create_dirs: bool = False,
    max_concurrent_uploads: Optional[int] = None,
) -> None:
    """Helper function to run parallel uploads for a list of paths.

    Used by Store to run rsync commands in parallel by
    providing appropriate command generators.

    Args:
        source_path_list: List of paths to local files or directories
        filesync_command_generator: Callable that generates rsync command
            for a list of files belonging to the same dir.
        dirsync_command_generator: Callable that generates rsync command
            for a directory.
        log_path: Path to the log file
        bucket_name: Name of the bucket
        access_denied_message: Message to intercept from the underlying
            upload utility when permissions are insufficient. Used in
            exception handling.
        create_dirs: If the local_path is a directory and this is set to
            False, the contents of the directory are directly uploaded to
            root of the bucket. If the local_path is a directory and this is
            set to True, the directory is created in the bucket root and
            contents are uploaded to it.
        max_concurrent_uploads: Maximum number of concurrent threads to use
            to upload files.
    """
    grouped_files, dirs = _group_files_by_dir(source_path_list)
    # One sync command per directory-worth of files...
    commands = [
        filesync_command_generator(dir_path, file_names)
        for dir_path, file_names in grouped_files.items()
    ]
    # ...plus one sync command per source directory.
    for dir_path in dirs:
        dest_dir_name = os.path.basename(dir_path) if create_dirs else ''
        commands.append(dirsync_command_generator(dir_path, dest_dir_name))

    # Fan the commands out over a thread pool; each worker invokes the
    # upload CLI and checks its output for errors.
    num_commands = len(commands)
    with pool.ThreadPool(processes=max_concurrent_uploads) as thread_pool:
        thread_pool.starmap(
            run_upload_cli,
            zip(
                commands,
                [access_denied_message] * num_commands,
                [bucket_name] * num_commands,
                [log_path] * num_commands,
            ),
        )
150
+
151
+
152
def get_gsutil_command() -> Tuple[str, str]:
    """Gets the alias'd command for gsutil and a command to define the alias.

    This is required for applying platform-specific flags to gsutil.

    In particular, we disable multiprocessing on Mac using
    `-o "GSUtil:parallel_process_count=1"`. Multithreading is still enabled.
    gsutil on Mac has a bug with multiprocessing that causes it to crash
    when uploading files. Related issues:
    https://bugs.python.org/issue33725
    https://github.com/GoogleCloudPlatform/gsutil/issues/464

    The flags are added by checking the platform using bash in a one-liner.
    The platform check is done inline to have the flags match where the command
    is executed, rather than where the code is run. This is important when
    the command is run in a remote VM.

    Returns:
        Tuple[str, str] : (gsutil_alias, command to generate the alias)
        The command to generate alias must be run before using the alias. E.g.,
        ```
        gsutil_alias, alias_gen = get_gsutil_command()
        cmd_to_run = f'{alias_gen}; {gsutil_alias} cp ...'
        ```
    """
    alias = 'skypilot_gsutil'
    no_multiprocessing_flag = '-o "GSUtil:parallel_process_count=1"'
    # Bodies of the two shell-function variants (Darwin vs everything else).
    darwin_body = f'gsutil -m {no_multiprocessing_flag} "$@"; '
    default_body = 'gsutil -m "$@"; '

    # A shell function (not an `alias`) so it is usable immediately after
    # its definition on the same command line.
    alias_gen = (
        f'[[ "$(uname)" == "Darwin" ]] && {alias}() {{ {darwin_body}}} '
        f'|| {alias}() {{ {default_body}}}'
    )

    return alias, alias_gen
190
+
191
+
192
def run_upload_cli(
    command: str, access_denied_message: str, bucket_name: str, log_path: str
):
    """Runs a single upload shell command and translates failures to errors.

    Raises:
        PermissionError: when the CLI output contains access_denied_message.
        exceptions.StorageUploadError: when the CLI exits non-zero.
    """
    exit_code, out, err = log_utils.run_with_log(  # type: ignore[misc]
        command,
        log_path,
        shell=True,
        require_outputs=True,
        # bash is required: generated commands can use bash-only syntax
        # such as [[ ... ]]
        executable='/bin/bash',
    )
    # Check for a permission problem first; it is more actionable than a
    # generic non-zero exit code.
    if access_denied_message in err:
        with ux_utils.print_exception_no_traceback():
            raise PermissionError(
                'Failed to upload files to '
                'the remote bucket. The bucket does not have '
                'write permissions. It is possible that '
                'the bucket is public.'
            )
    if exit_code != 0:
        with ux_utils.print_exception_no_traceback():
            logger.error(err)
            raise exceptions.StorageUploadError(
                f'Upload to bucket failed for store {bucket_name}. '
                f'Please check the logs: {log_path}'
            )
    if not out:
        logger.debug(
            'No file uploaded. This could be due to an error or '
            'because all files already exist on the cloud.'
        )
@@ -0,0 +1,19 @@
1
+ """Data sync between workstation <--> blob (s3, gcs, etc.) <--> worker pods"""
2
+
3
+ from konduktor.data.gcp.constants import (
4
+ DEFAULT_SERVICE_ACCOUNT_ROLES,
5
+ STORAGE_MINIMAL_PERMISSIONS,
6
+ )
7
+ from konduktor.data.gcp.gcs import (
8
+ DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH,
9
+ GOOGLE_SDK_INSTALLATION_COMMAND,
10
+ GcsStore,
11
+ )
12
+
13
+ __all__ = [
14
+ 'GcsStore',
15
+ 'DEFAULT_SERVICE_ACCOUNT_ROLES',
16
+ 'STORAGE_MINIMAL_PERMISSIONS',
17
+ 'GOOGLE_SDK_INSTALLATION_COMMAND',
18
+ 'DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH',
19
+ ]
@@ -0,0 +1,42 @@
1
VERSION = 'v1'

KONDUKTOR = 'KONDUKTOR'
# Derived identifiers for the service account created/managed by Konduktor.
KONDUKTOR_SERVICE_ACCOUNT_ID = f'{KONDUKTOR}-{VERSION}'
KONDUKTOR_SERVICE_ACCOUNT_EMAIL_TEMPLATE = (
    '{account_id}@{project_id}.iam.gserviceaccount.com'
)
KONDUKTOR_SERVICE_ACCOUNT_CONFIG = {
    'displayName': f'KONDUKTOR Service Account ({VERSION})',
}

# Roles that are always granted to the service account.
# NOTE: `serviceAccountUser` allows the head node to create workers with
# a serviceAccount. `roleViewer` allows the head node to run bootstrap_gcp.
DEFAULT_SERVICE_ACCOUNT_ROLES = [
    'roles/storage.admin',
    'roles/iam.serviceAccountUser',
    'roles/iam.roleViewer',
]

# Minimal set of permissions required to run Konduktor on GCP.
# Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # noqa: E501
STORAGE_MINIMAL_PERMISSIONS = [
    'iam.roles.get',
    # We now skip the check for `iam.serviceAccounts.actAs` permission for
    # simplicity as it can be granted at the service-account level.
    # Check: sky.provision.gcp.config::_is_permission_satisfied
    # 'iam.serviceAccounts.actAs',
    'iam.serviceAccounts.get',
    'serviceusage.services.enable',
    'serviceusage.services.list',
    'serviceusage.services.use',
    'storage.buckets.create',
    'storage.buckets.get',
    'storage.buckets.delete',
    'storage.objects.create',
    'storage.objects.delete',
    'storage.objects.update',
    'storage.objects.get',
    'storage.objects.list',
    'resourcemanager.projects.get',
]