konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/data/constants.py
@@ -0,0 +1,7 @@
+ from typing import Any, Union
+
+ Path = str
+ SourceType = Union[Path]
+ StorageHandle = Any
+
+ _STORAGE_LOG_FILE_NAME = 'storage.log'
konduktor/data/data_utils.py
@@ -0,0 +1,268 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import urllib.parse
+ from multiprocessing import pool
+ from typing import Any, Callable, Dict, List, Optional, Tuple
+
+ from konduktor import logging
+ from konduktor.adaptors import aws, gcp
+ from konduktor.utils import exceptions, log_utils, ux_utils
+
+ Client = Any
+
+ logger = logging.get_logger(__name__)
+
+
+ def split_gcs_path(gcs_path: str) -> Tuple[str, str]:
+     """Splits a GCS path into the bucket name and the path relative to it.
+
+     Args:
+         gcs_path: str; GCS path, e.g. gs://imagenet/train/
+     """
+     path_parts = gcs_path.replace('gs://', '').split('/')
+     bucket = path_parts.pop(0)
+     key = '/'.join(path_parts)
+     return bucket, key
+
+
+ def split_s3_path(s3_path: str) -> Tuple[str, str]:
+     """Splits an S3 path into the bucket name and the path relative to it.
+
+     Args:
+         s3_path: str; S3 path, e.g. s3://imagenet/train/
+     """
+     path_parts = s3_path.replace('s3://', '').split('/')
+     bucket = path_parts.pop(0)
+     key = '/'.join(path_parts)
+     return bucket, key
+
+
+ def create_s3_client(region: Optional[str] = None) -> Client:
+     """Helper method that creates a boto3 client for S3.
+
+     Args:
+         region: str; Region name, e.g. us-west-1, us-east-2. If None, the
+             default region us-east-1 is used.
+     """
+     if region is None:
+         region = 'us-east-1'
+     return aws.client('s3', region_name=region)
+
+
+ def verify_gcs_bucket(name: str) -> bool:
+     """Helper method that checks if the GCS bucket exists.
+
+     Args:
+         name: str; Name of the GCS bucket (without the gs:// prefix)
+     """
+     try:
+         gcp.storage_client().get_bucket(name)
+         return True
+     except gcp.not_found_exception():
+         return False
+
+
+ def verify_s3_bucket(name: str) -> bool:
+     """Helper method that checks if the S3 bucket exists.
+
+     Args:
+         name: str; Name of the S3 bucket (without the s3:// prefix)
+     """
+     import boto3
+     from botocore.exceptions import ClientError
+
+     s3 = boto3.client('s3')
+     try:
+         s3.head_bucket(Bucket=name)
+         return True
+     except ClientError as e:
+         error_code = int(e.response['Error']['Code'])
+         if error_code == 404:
+             return False
+         # For permission issues or other errors, assume the bucket is not accessible.
+         return False
+
+
+ def is_cloud_store_url(url):
+     result = urllib.parse.urlsplit(url)
+     # netloc is '' for non-cloud (local) URLs.
+     return result.netloc
+
+
+ def _group_files_by_dir(
+     source_list: List[str],
+ ) -> Tuple[Dict[str, List[str]], List[str]]:
+     """Groups a list of paths based on their directory.
+
+     Given a list of paths, generates a dict of {dir_name: List[file_name]}
+     which groups files with the same dir, and a list of dirs in the source_list.
+
+     This is used to optimize uploads by reducing the number of calls to rsync.
+     E.g., ['a/b/c.txt', 'a/b/d.txt', 'a/e.txt'] will be grouped into
+     {'a/b': ['c.txt', 'd.txt'], 'a': ['e.txt']}, and these three files can be
+     uploaded in two rsync calls instead of three.
+
+     Args:
+         source_list: List[str]; List of paths to group
+     """
+     grouped_files: Dict[str, List[str]] = {}
+     dirs = []
+     for source in source_list:
+         source = os.path.abspath(os.path.expanduser(source))
+         if os.path.isdir(source):
+             dirs.append(source)
+         else:
+             base_path = os.path.dirname(source)
+             file_name = os.path.basename(source)
+             if base_path not in grouped_files:
+                 grouped_files[base_path] = []
+             grouped_files[base_path].append(file_name)
+     return grouped_files, dirs
+
+
+ def parallel_upload(
+     source_path_list: List[str],
+     filesync_command_generator: Callable[[str, List[str]], str],
+     dirsync_command_generator: Callable[[str, str], str],
+     log_path: str,
+     bucket_name: str,
+     access_denied_message: str,
+     create_dirs: bool = False,
+     max_concurrent_uploads: Optional[int] = None,
+ ) -> None:
+     """Helper function to run parallel uploads for a list of paths.
+
+     Used by Store to run rsync commands in parallel by
+     providing appropriate command generators.
+
+     Args:
+         source_path_list: List of paths to local files or directories
+         filesync_command_generator: Callable that generates the rsync command
+             for a list of files belonging to the same dir.
+         dirsync_command_generator: Callable that generates the rsync command
+             for a directory.
+         log_path: Path to the log file
+         bucket_name: Name of the bucket
+         access_denied_message: Message to intercept from the underlying
+             upload utility when permissions are insufficient. Used in
+             exception handling.
+         create_dirs: If the local path is a directory and this is set to
+             False, the contents of the directory are uploaded directly to
+             the root of the bucket. If the local path is a directory and
+             this is set to True, the directory is created in the bucket
+             root and its contents are uploaded to it.
+         max_concurrent_uploads: Maximum number of concurrent threads to use
+             to upload files.
+     """
+     # Generate gsutil rsync commands for files and dirs
+     commands = []
+     grouped_files, dirs = _group_files_by_dir(source_path_list)
+     # Generate file upload commands
+     for dir_path, file_names in grouped_files.items():
+         sync_command = filesync_command_generator(dir_path, file_names)
+         commands.append(sync_command)
+     # Generate dir upload commands
+     for dir_path in dirs:
+         if create_dirs:
+             dest_dir_name = os.path.basename(dir_path)
+         else:
+             dest_dir_name = ''
+         sync_command = dirsync_command_generator(dir_path, dest_dir_name)
+         commands.append(sync_command)
+
+     # Run commands in parallel
+     with pool.ThreadPool(processes=max_concurrent_uploads) as p:
+         p.starmap(
+             run_upload_cli,
+             zip(
+                 commands,
+                 [access_denied_message] * len(commands),
+                 [bucket_name] * len(commands),
+                 [log_path] * len(commands),
+             ),
+         )
+
+
+ def get_gsutil_command() -> Tuple[str, str]:
+     """Gets the aliased command for gsutil and a command to define the alias.
+
+     This is required for applying platform-specific flags to gsutil.
+
+     In particular, we disable multiprocessing on Mac using
+     `-o "GSUtil:parallel_process_count=1"`. Multithreading is still enabled.
+     gsutil on Mac has a bug with multiprocessing that causes it to crash
+     when uploading files. Related issues:
+     https://bugs.python.org/issue33725
+     https://github.com/GoogleCloudPlatform/gsutil/issues/464
+
+     The flags are added by checking the platform using bash in a one-liner.
+     The platform check is done inline so that the flags match where the command
+     is executed, rather than where the code is run. This is important when
+     the command is run on a remote VM.
+
+     Returns:
+         Tuple[str, str]: (gsutil_alias, command to generate the alias)
+         The command to generate the alias must be run before using the alias. E.g.,
+         ```
+         gsutil_alias, alias_gen = get_gsutil_command()
+         cmd_to_run = f'{alias_gen}; {gsutil_alias} cp ...'
+         ```
+     """
+     gsutil_alias = 'konduktor_gsutil'
+     disable_multiprocessing_flag = '-o "GSUtil:parallel_process_count=1"'
+
+     # Define konduktor_gsutil as a shell function instead of an alias.
+     # The function behaves just like an alias, but can be called immediately
+     # after its definition on the same line.
+     alias_gen = (
+         f'[[ "$(uname)" == "Darwin" ]] && {gsutil_alias}() {{ '
+         f'gsutil -m {disable_multiprocessing_flag} "$@"; }} '
+         f'|| {gsutil_alias}() {{ gsutil -m "$@"; }}'
+     )
+
+     return gsutil_alias, alias_gen
+
+
+ def run_upload_cli(
+     command: str, access_denied_message: str, bucket_name: str, log_path: str
+ ):
+     returncode, stdout, stderr = log_utils.run_with_log(  # type: ignore[misc]
+         command,
+         log_path,
+         shell=True,
+         require_outputs=True,
+         # We need to use bash, as some of the cloud commands use bash syntax,
+         # such as [[ ... ]]
+         executable='/bin/bash',
+     )
+     if access_denied_message in stderr:
+         with ux_utils.print_exception_no_traceback():
+             raise PermissionError(
+                 'Failed to upload files to '
+                 'the remote bucket. The bucket does not have '
+                 'write permissions. It is possible that '
+                 'the bucket is public.'
+             )
+     if returncode != 0:
+         with ux_utils.print_exception_no_traceback():
+             logger.error(stderr)
+             raise exceptions.StorageUploadError(
+                 f'Upload to bucket failed for store {bucket_name}. '
+                 f'Please check the logs: {log_path}'
+             )
+     if not stdout:
+         logger.debug(
+             'No file uploaded. This could be due to an error or '
+             'because all files already exist on the cloud.'
+         )
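
For readers skimming the diff, the command-generator contract of `parallel_upload` is easiest to see with a toy driver. The sketch below is hypothetical: the `echo`-based generators stand in for the real `gsutil rsync`/`aws s3 sync` command builders that the stores supply, and the paths, bucket name, log path, and access-denied string are made up for illustration.

    from typing import List

    from konduktor.data import data_utils

    def file_sync_command(base_dir: str, file_names: List[str]) -> str:
        # A real store would emit e.g. a gsutil rsync command here;
        # echo keeps the sketch side-effect free.
        return f'echo sync {len(file_names)} file(s) from {base_dir}'

    def dir_sync_command(src_dir: str, dest_dir_name: str) -> str:
        return f'echo sync dir {src_dir} -> {dest_dir_name or "<bucket root>"}'

    data_utils.parallel_upload(
        source_path_list=['~/data/a.txt', '~/data/b.txt', '~/ckpts/'],
        filesync_command_generator=file_sync_command,
        dirsync_command_generator=dir_sync_command,
        log_path='/tmp/upload.log',            # hypothetical log file
        bucket_name='my-bucket',               # hypothetical bucket
        access_denied_message='AccessDenied',  # hypothetical marker string
        create_dirs=True,
        max_concurrent_uploads=4,
    )

Note how the two files under `~/data` are grouped by `_group_files_by_dir` into a single command, so the thread pool runs two commands for three source paths.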
konduktor/data/gcp/__init__.py
@@ -0,0 +1,19 @@
+ """Data sync between workstation <--> blob (s3, gcs, etc.) <--> worker pods"""
+
+ from konduktor.data.gcp.constants import (
+     STORAGE_MINIMAL_PERMISSIONS,
+ )
+ from konduktor.data.gcp.gcs import (
+     DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH,
+     GOOGLE_SDK_INSTALLATION_COMMAND,
+     GcsCloudStorage,
+     GcsStore,
+ )
+
+ __all__ = [
+     'GcsStore',
+     'GcsCloudStorage',
+     'STORAGE_MINIMAL_PERMISSIONS',
+     'GOOGLE_SDK_INSTALLATION_COMMAND',
+     'DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH',
+ ]
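
These re-exports let downstream code import the GCS surface from the package root rather than from the submodules; a trivial sketch:

    from konduktor.data.gcp import STORAGE_MINIMAL_PERMISSIONS, GcsStore

    print(f'GCS setup checks {len(STORAGE_MINIMAL_PERMISSIONS)} IAM permissions')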
konduktor/data/gcp/constants.py
@@ -0,0 +1,42 @@
+ VERSION = 'v1'
+
+ KONDUKTOR = 'KONDUKTOR'
+ KONDUKTOR_SERVICE_ACCOUNT_ID = KONDUKTOR + '-' + VERSION
+ KONDUKTOR_SERVICE_ACCOUNT_EMAIL_TEMPLATE = (
+     '{account_id}@{project_id}.iam.gserviceaccount.com'
+ )
+ KONDUKTOR_SERVICE_ACCOUNT_CONFIG = {
+     'displayName': f'KONDUKTOR Service Account ({VERSION})',
+ }
+
+ # These roles will always be added.
+ # NOTE: `serviceAccountUser` allows the head node to create workers with
+ # a serviceAccount. `roleViewer` allows the head node to run bootstrap_gcp.
+ DEFAULT_SERVICE_ACCOUNT_ROLES = [
+     'roles/storage.admin',
+     'roles/iam.serviceAccountUser',
+     'roles/iam.roleViewer',
+ ]
+
+ # A list of permissions required to run Konduktor on GCP.
+ # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html  # noqa: E501
+ STORAGE_MINIMAL_PERMISSIONS = [
+     'iam.roles.get',
+     # We now skip the check for the `iam.serviceAccounts.actAs` permission
+     # for simplicity, as it can be granted at the service-account level.
+     # Check: sky.provision.gcp.config::_is_permission_satisfied
+     # 'iam.serviceAccounts.actAs',
+     'iam.serviceAccounts.get',
+     'serviceusage.services.enable',
+     'serviceusage.services.list',
+     'serviceusage.services.use',
+     'storage.buckets.create',
+     'storage.buckets.get',
+     'storage.buckets.delete',
+     'storage.objects.create',
+     'storage.objects.delete',
+     'storage.objects.update',
+     'storage.objects.get',
+     'storage.objects.list',
+     'resourcemanager.projects.get',
+ ]
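
To illustrate how the naming constants above compose, here is a small sketch; the project id is a placeholder, and the `.lower()` call is an assumption on our part (GCP requires service-account ids to be lowercase, while the constant is uppercase):

    from konduktor.data.gcp import constants

    account_id = constants.KONDUKTOR_SERVICE_ACCOUNT_ID.lower()  # 'konduktor-v1'
    email = constants.KONDUKTOR_SERVICE_ACCOUNT_EMAIL_TEMPLATE.format(
        account_id=account_id,
        project_id='my-project',  # hypothetical project id
    )
    assert email == 'konduktor-v1@my-project.iam.gserviceaccount.com'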