konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,500 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Utility functions for the storage module."""
14
+
15
+ import glob
16
+ import os
17
+ import shlex
18
+ import subprocess
19
+ import typing
20
+ from typing import List, Optional
21
+
22
+ import colorama
23
+
24
+ if typing.TYPE_CHECKING:
25
+ from konduktor.data.constants import SourceType, StorageHandle
26
+
27
+
28
+ from konduktor import constants, logging
29
+ from konduktor.utils import common_utils, exceptions
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
# Warning emitted when .gitignore-based exclusion fails and files are uploaded
# anyway.  The colorama prefix is resolved eagerly (f-string); {path!r} and
# {error_msg!r} are filled in later via str.format().
# Fix: a space was missing between '{path!r}' and 'due', which rendered as
# "...for 'path'due to...".
_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
    f'{colorama.Fore.YELLOW}Warning: Files/dirs '
    'specified in .gitignore will be uploaded '
    'to the cloud storage for {path!r} '
    'due to the following error: {error_msg!r}'
)
39
+
40
+
41
def get_excluded_files_from_konduktorignore(src_dir_path: str) -> List[str]:
    """List files and patterns ignored by the .konduktorignore file
    in the given source directory.

    Args:
        src_dir_path: Directory to scan; '~' is expanded.

    Returns:
        Paths, relative to src_dir_path, that match the ignore patterns.
        Empty list if the ignore file is missing or unreadable.
    """
    excluded_list: List[str] = []
    expand_src_dir_path = os.path.expanduser(src_dir_path)
    konduktorignore_path = os.path.join(
        expand_src_dir_path, constants.KONDUKTOR_IGNORE_FILE
    )

    try:
        with open(konduktorignore_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and comments.
                if not line or line.startswith('#'):
                    continue
                # Make parsing consistent with rsync: rsync uses '/' as the
                # current directory, so a leading '/' anchors the pattern to
                # the source dir; anything else matches at any depth.
                if line.startswith('/'):
                    pattern = '.' + line
                else:
                    pattern = '**/' + line
                # Expand the pattern, then convert every match to a path
                # relative to the source dir to comply with cloud rsync
                # format (was an index-mutation loop; now a generator).
                matching_files = glob.glob(
                    os.path.join(expand_src_dir_path, pattern), recursive=True
                )
                excluded_list.extend(
                    os.path.relpath(match, expand_src_dir_path)
                    for match in matching_files
                )
    except IOError as e:
        logger.warning(
            f'Error reading {konduktorignore_path}: '
            f'{common_utils.format_exception(e, use_bracket=True)}'
        )

    return excluded_list
79
+
80
+
81
def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
    """Lists files and patterns ignored by git in the source directory

    Runs `git status --ignored` which returns a list of excluded files and
    patterns read from .gitignore and .git/info/exclude using git.
    `git init` is run if SRC_DIR_PATH is not a git repository and removed
    after obtaining excluded list.

    Returns:
        List[str] containing files and patterns to be ignored. Some of the
        patterns include, **/mydir/*.txt, !myfile.log, or file-*/.
    """
    expand_src_dir_path = os.path.expanduser(src_dir_path)

    # The two sources git consults for exclusions.
    git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude')
    gitignore_path = os.path.join(expand_src_dir_path, constants.GIT_IGNORE_FILE)

    git_exclude_exists = os.path.isfile(git_exclude_path)
    gitignore_exists = os.path.isfile(gitignore_path)

    # This command outputs a list to be excluded according to .gitignore
    # and .git/info/exclude
    filter_cmd = (
        f'git -C {shlex.quote(expand_src_dir_path)} ' 'status --ignored --porcelain=v1'
    )
    excluded_list: List[str] = []

    if git_exclude_exists or gitignore_exists:
        try:
            output = subprocess.run(
                filter_cmd,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            # when the SRC_DIR_PATH is not a git repo and .git
            # does not exist in it
            if e.returncode == exceptions.GIT_FATAL_EXIT_CODE:
                if 'not a git repository' in e.stderr:
                    # Check if the user has 'write' permission to
                    # SRC_DIR_PATH (needed for the temporary `git init`).
                    if not os.access(expand_src_dir_path, os.W_OK):
                        error_msg = 'Write permission denial'
                        logger.warning(
                            _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
                                path=src_dir_path, error_msg=error_msg
                            )
                        )
                        return excluded_list
                    # Temporarily initialize a repo so `git status --ignored`
                    # can be re-run; the created files are removed below.
                    init_cmd = f'git -C {expand_src_dir_path} init'
                    try:
                        subprocess.run(
                            init_cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            check=True,
                        )
                        output = subprocess.run(
                            filter_cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            check=True,
                            text=True,
                        )
                    except subprocess.CalledProcessError as init_e:
                        logger.warning(
                            _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
                                path=src_dir_path, error_msg=init_e.stderr
                            )
                        )
                        return excluded_list
                    if git_exclude_exists:
                        # removes all the files/dirs created with 'git init'
                        # under .git/ except .git/info/exclude
                        remove_files_cmd = (
                            f'find {expand_src_dir_path}'
                            f'/.git -path {git_exclude_path}'
                            ' -prune -o -type f -exec rm -f '
                            '{} +'
                        )
                        remove_dirs_cmd = (
                            f'find {expand_src_dir_path}'
                            f'/.git -path {git_exclude_path}'
                            ' -o -type d -empty -delete'
                        )
                        subprocess.run(
                            remove_files_cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            check=True,
                        )
                        subprocess.run(
                            remove_dirs_cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            check=True,
                        )

        # NOTE(review): if the CalledProcessError above has a returncode other
        # than GIT_FATAL_EXIT_CODE, or stderr without 'not a git repository',
        # `output` is never bound and the next line raises NameError —
        # presumably unreachable in practice; confirm against callers.
        output_list = output.stdout.split('\n')
        for line in output_list:
            # FILTER_CMD outputs items preceded by '!!'
            # to specify excluded files/dirs
            # e.g., '!! mydir/' or '!! mydir/myfile.txt'
            if line.startswith('!!'):
                to_be_excluded = line[3:]
                if line.endswith('/'):
                    # aws s3 sync and gsutil rsync require * to exclude
                    # files/dirs under the specified directory.
                    to_be_excluded += '*'
                excluded_list.append(to_be_excluded)
    return excluded_list
199
+
200
+
201
def get_excluded_files(src_dir_path: str) -> List[str]:
    """List files and directories to be excluded.

    Prefers the project's .konduktorignore file when present; otherwise
    falls back to git's ignore rules.
    """
    # TODO: this could return a huge list of files,
    # should think of ways to optimize.
    expanded_path = os.path.expanduser(src_dir_path)
    ignore_path = os.path.join(expanded_path, constants.KONDUKTOR_IGNORE_FILE)
    use_konduktorignore = os.path.exists(ignore_path)
    basis = (
        constants.KONDUKTOR_IGNORE_FILE
        if use_konduktorignore
        else constants.GIT_IGNORE_FILE
    )
    logger.info(
        f' {colorama.Style.DIM}'
        f'Excluded files to sync to cluster based on '
        f'{basis}.'
        f'{colorama.Style.RESET_ALL}'
    )
    if use_konduktorignore:
        return get_excluded_files_from_konduktorignore(src_dir_path)
    return get_excluded_files_from_gitignore(src_dir_path)
224
+
225
+
226
class AbstractStore:
    """AbstractStore abstracts away the different storage types exposed by
    different clouds.

    Storage objects are backed by AbstractStores, each representing a store
    present in a cloud.
    """

    class StoreMetadata:
        """A pickle-able representation of Store

        Allows store objects to be written to and reconstructed from
        global_user_state.
        """

        def __init__(
            self,
            *,
            name: str,
            source: Optional['SourceType'],
            region: Optional[str] = None,
            is_sky_managed: Optional[bool] = None,
            _bucket_sub_path: Optional[str] = None,
        ):
            self.name = name
            self.source = source
            self.region = region
            self.is_sky_managed = is_sky_managed
            self._bucket_sub_path = _bucket_sub_path

        def __repr__(self):
            # Fix: the closing parenthesis of the repr was missing.
            return (
                f'StoreMetadata('
                f'\n\tname={self.name},'
                f'\n\tsource={self.source},'
                f'\n\tregion={self.region},'
                f'\n\tis_sky_managed={self.is_sky_managed},'
                f'\n\t_bucket_sub_path={self._bucket_sub_path})'
            )

    def __init__(
        self,
        name: str,
        source: Optional['SourceType'],
        region: Optional[str] = None,
        is_sky_managed: Optional[bool] = None,
        sync_on_reconstruction: Optional[bool] = True,
        _bucket_sub_path: Optional[str] = None,
    ):
        """Initialize AbstractStore

        Args:
            name: Store name
            source: Data source for the store
            region: Region to place the bucket in
            is_sky_managed: Whether the store is managed by Sky. If None, it
              must be populated by the implementing class during initialization.
            sync_on_reconstruction: Whether to sync data on reconstruction
              from metadata.
            _bucket_sub_path: Optional sub-path within the bucket.

        Raises:
            StorageBucketCreateError: If bucket creation fails
            StorageBucketGetError: If fetching existing bucket fails
            StorageInitError: If general initialization fails
        """
        self.name = name
        self.source = source
        self.region = region
        self.is_sky_managed = is_sky_managed
        self.sync_on_reconstruction = sync_on_reconstruction
        self._bucket_sub_path = _bucket_sub_path
        # Whether sky is responsible for the lifecycle of the Store.
        self._validate()
        self.initialize()

    @property
    def bucket_sub_path(self) -> Optional[str]:
        """Get the bucket_sub_path."""
        return self._bucket_sub_path

    @classmethod
    def from_metadata(cls, metadata: StoreMetadata, **override_args):
        """Create a Store from a StoreMetadata object.

        Used when reconstructing Storage and Store objects from
        global_user_state.

        Fix: previously dropped is_sky_managed and _bucket_sub_path, so
        stores reconstructed from metadata silently lost that state.
        """
        return cls(
            name=override_args.get('name', metadata.name),
            source=override_args.get('source', metadata.source),
            region=override_args.get('region', metadata.region),
            is_sky_managed=override_args.get(
                'is_sky_managed', metadata.is_sky_managed
            ),
            _bucket_sub_path=override_args.get(
                '_bucket_sub_path', metadata._bucket_sub_path
            ),
        )

    def get_metadata(self) -> StoreMetadata:
        """Return a pickle-able snapshot of this store.

        Fix: now includes is_sky_managed and _bucket_sub_path so that
        from_metadata(get_metadata()) round-trips all fields.
        """
        return self.StoreMetadata(
            name=self.name,
            source=self.source,
            region=self.region,
            is_sky_managed=self.is_sky_managed,
            _bucket_sub_path=self._bucket_sub_path,
        )

    def initialize(self):
        """Initializes the Store object on the cloud.

        Initialization involves fetching bucket if exists, or creating it if
        it does not.

        Raises:
            StorageBucketCreateError: If bucket creation fails
            StorageBucketGetError: If fetching existing bucket fails
            StorageInitError: If general initialization fails.
        """
        pass

    def _validate(self) -> None:
        """Runs validation checks on class args"""
        pass

    def upload(self) -> None:
        """Uploads source to the store bucket

        Upload must be called by the Storage handler - it is not called on
        Store initialization.
        """
        raise NotImplementedError

    def delete(self) -> None:
        """Removes the Storage object from the cloud."""
        raise NotImplementedError

    def get_handle(self) -> 'StorageHandle':
        """Returns the storage handle for use by the execution backend to attach
        to VM/containers
        """
        raise NotImplementedError

    def download_remote_dir(self, local_path: str) -> None:
        """Downloads directory from remote bucket to the specified
        local_path

        Args:
            local_path: Local path on user's device
        """
        raise NotImplementedError

    def _download_file(self, remote_path: str, local_path: str) -> None:
        """Downloads file from remote to local on Store

        Args:
            remote_path: str; Remote file path on Store
            local_path: str; Local file path on user's device
        """
        raise NotImplementedError

    def mount_command(self, mount_path: str) -> str:
        """Returns the command to mount the Store to the specified mount_path.

        Includes the setup commands to install mounting tools.

        Args:
            mount_path: str; Mount path on remote server
        """
        raise NotImplementedError

    def __deepcopy__(self, memo):
        # S3 Client and GCS Client cannot be deep copied, hence the
        # original Store object is returned
        return self

    def _validate_existing_bucket(self):
        """Validates the storage fields for existing buckets."""
        # Check if 'source' is None, this is only allowed when Storage is in
        # either MOUNT mode or COPY mode with sky-managed storage.
        # Note: In COPY mode, a 'source' being None with non-sky-managed
        # storage is already handled as an error in _validate_storage_spec.
        if self.source is None:
            # Retrieve a handle associated with the storage name.
            # This handle links to sky managed storage if it exists.
            raise NotImplementedError("We don't handle empty sources for now")

    @classmethod
    def check_credentials(cls):
        """
        Check if the credentials stored on client are valid for the store.
        This function always runs after check_credentials_from_secret. If
        the credentials work, we create/update the secret on the cluster.
        """
        raise NotImplementedError

    @classmethod
    def set_secret_credentials(cls):
        """
        Set the k8s secret storing the credentials for the store.
        """
        raise NotImplementedError

    # TODO(zhwu): Make the return type immutable.
    @classmethod
    def get_user_identities(cls) -> Optional[List[List[str]]]:
        """(Advanced) Returns all available user identities of this cloud.

        The user "identity" is associated with each SkyPilot cluster they
        create. This is used in protecting cluster operations, such as
        provision, teardown and status refreshing, in a multi-identity
        scenario, where the same user/device can switch between different
        cloud identities. We check that the user identity matches before:
            - Provisioning/starting a cluster
            - Stopping/tearing down a cluster
            - Refreshing the status of a cluster

        Design choices:
          1. We allow the operations that can correctly work with a different
             user identity, as a user should have full control over all their
             clusters (no matter which identity it belongs to), e.g.,
             submitting jobs, viewing logs, auto-stopping, etc.
          2. A cloud implementation can optionally switch between different
             identities if required for cluster operations. In this case,
             the cloud implementation should return multiple identities
             as a list. E.g., our Kubernetes implementation can use multiple
             kubeconfig contexts to switch between different identities.

        The choice of what constitutes an identity is up to each cloud's
        implementation. In general, to suffice for the above purposes,
        ensure that different identities should imply different sets of
        resources are used when the user invoked each cloud's default
        CLI/API.

        An identity is a list of strings. The list is in the order of
        strictness, i.e., the first element is the most strict identity, and
        the last element is the least strict identity.
        When performing an identity check between the current active identity
        and the owner identity associated with a cluster, we compare the two
        lists in order: if a position does not match, we go to the next. To
        see an example, see the docstring of the AWS.get_user_identities.

        Example identities (see cloud implementations):
            - AWS: [UserId, AccountId]
            - GCP: [email address + project ID]
            - Azure: [email address + subscription ID]
            - Kubernetes: [context name]

        Example return values:
            - AWS: [[UserId, AccountId]]
            - GCP: [[email address + project ID]]
            - Azure: [[email address + subscription ID]]
            - Kubernetes: [[current active context], [context 2], ...]

        Returns:
            None if the cloud does not have a concept of user identity
            (access protection will be disabled for these clusters);
            otherwise a list of available identities with the current active
            identity being the first element. Most clouds have only one identity
            available, so the returned list will only have one element: the
            current active identity.

        Raises:
            exceptions.CloudUserIdentityError: If the user identity cannot be
                retrieved.
        """
        return None

    @classmethod
    def get_active_user_identity(cls) -> Optional[List[str]]:
        """Returns currently active user identity of this cloud

        See get_user_identities for definition of user identity.

        Returns:
            None if the cloud does not have a concept of user identity;
            otherwise the current active identity.
        """
        identities = cls.get_user_identities()
        return identities[0] if identities is not None else None

    @classmethod
    def get_k8s_credential_name(cls) -> str:
        """Returns the name of the k8s secret storing the credentials for the store."""
        raise NotImplementedError