konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,906 @@
+ # Proprietary Changes made for Trainy under the Trainy Software License
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Google Cloud Platform Storage."""
+
+ import enum
+ import os
+ import re
+ import shlex
+ import subprocess
+ import time
+ import typing
+ from typing import List, Optional, Tuple
+
+ import colorama
+
+ if typing.TYPE_CHECKING:
+     from google.cloud import storage as gcs_storage
+
+ from konduktor import logging
+ from konduktor.adaptors import gcp
+ from konduktor.data import constants, data_utils, storage_utils
+ from konduktor.data.gcp import utils
+ from konduktor.utils import (
+     base64_utils,
+     common_utils,
+     exceptions,
+     kubernetes_utils,
+     rich_utils,
+     ux_utils,
+ )
+
+ logger = logging.get_logger(__name__)
+
+ # Maximum number of concurrent rsync upload processes
+ _MAX_CONCURRENT_UPLOADS = 32
+
+ # Env var pointing to any service account key. If it exists, this path takes
+ # priority over the DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH below, and will be
+ # used instead for Konduktor-launched instances. This is the same behavior as
+ # gcloud:
+ # https://cloud.google.com/docs/authentication/provide-credentials-adc#local-key
+ _GCP_APPLICATION_CREDENTIAL_ENV = 'GOOGLE_APPLICATION_CREDENTIALS'
+ # NOTE: do not expanduser() on this path. It's used as a destination path on the
+ # remote cluster.
+ DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH: str = (
+     '~/.config/gcloud/application_default_credentials.json'
+ )
+ DEFAULT_GCP_CREDENTIALS_DIR = '~/.config/gcloud'
+
+ # TODO(wei-lin): config_default may not be the config in use.
+ # See: https://github.com/skypilot-org/skypilot/pull/1539
+ # NOTE: do not expanduser() on this path. It's used as a destination path on the
+ # remote cluster.
+ GCP_CONFIG_PATH = '~/.config/gcloud/configurations/config_default'
+
+ # Minimum set of files under ~/.config/gcloud that grant GCP access.
+ _CREDENTIAL_FILES = [
+     'credentials.db',
+     'access_tokens.db',
+     'configurations',
+     'legacy_credentials',
+     'active_config',
+     'application_default_credentials.json',
+ ]
+
+ # k8s secret name for gcp credentials
+ GCP_SECRET_NAME = 'gcpcredentials'
+ GCP_CREDENTIALS_KEY = 'gcpcredentials'
+
+ # NOTE: do not expanduser() on this path. It's used as a destination path on the
+ # remote cluster.
+ _GCLOUD_INSTALLATION_LOG = '~/.konduktor/logs/gcloud_installation.log'
+ _GCLOUD_VERSION = '424.0.0'
+ # Needs to be run with /bin/bash.
+ # We factor out the installation logic to keep it aligned in both the spot
+ # controller and cloud stores.
+ GOOGLE_SDK_INSTALLATION_COMMAND: str = f'pushd /tmp &>/dev/null && \
+ {{ gcloud --help > /dev/null 2>&1 || \
+ {{ mkdir -p {os.path.dirname(_GCLOUD_INSTALLATION_LOG)} && \
+ wget --quiet https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-{_GCLOUD_VERSION}-linux-x86_64.tar.gz > {_GCLOUD_INSTALLATION_LOG} && \
+ tar xzf google-cloud-sdk-{_GCLOUD_VERSION}-linux-x86_64.tar.gz >> {_GCLOUD_INSTALLATION_LOG} && \
+ rm -rf ~/google-cloud-sdk >> {_GCLOUD_INSTALLATION_LOG} && \
+ mv google-cloud-sdk ~/ && \
+ ~/google-cloud-sdk/install.sh -q >> {_GCLOUD_INSTALLATION_LOG} 2>&1 && \
+ echo "source ~/google-cloud-sdk/path.bash.inc > /dev/null 2>&1" >> ~/.bashrc && \
+ source ~/google-cloud-sdk/path.bash.inc >> {_GCLOUD_INSTALLATION_LOG} 2>&1; }}; }} && \
+ popd &>/dev/null'  # noqa: E501
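For context, a command string like GOOGLE_SDK_INSTALLATION_COMMAND relies on bash-isms (pushd/popd, source), so per the comment above it must run under /bin/bash rather than the default /bin/sh. A minimal sketch of how such a string could be invoked (illustrative only, not part of the diff):

import subprocess

# Run the installer string under bash; check=True raises on a non-zero exit.
subprocess.run(
    GOOGLE_SDK_INSTALLATION_COMMAND,
    shell=True,
    executable='/bin/bash',
    check=True,
)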
+
+
+ class GCPIdentityType(enum.Enum):
+     """GCP identity type.
+
+     The account type is determined by the current user identity, based on
+     the identity email.
+     """
+
+     # Example of a service account email:
+     # skypilot-v1@xxxx.iam.gserviceaccount.com
+     SERVICE_ACCOUNT = 'iam.gserviceaccount.com'
+
+     SHARED_CREDENTIALS_FILE = ''
+
+     def can_credential_expire(self) -> bool:
+         return self == GCPIdentityType.SHARED_CREDENTIALS_FILE
+
+
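Because the enum values are email-domain substrings, classifying an identity is a simple containment check. A small sketch with a hypothetical email, mirroring the logic in GcsStore._get_identity_type further below:

# Hypothetical account email; service accounts end in iam.gserviceaccount.com.
account = 'skypilot-v1@my-project.iam.gserviceaccount.com'
if GCPIdentityType.SERVICE_ACCOUNT.value in account:
    identity = GCPIdentityType.SERVICE_ACCOUNT
else:
    identity = GCPIdentityType.SHARED_CREDENTIALS_FILE
assert not identity.can_credential_expire()  # service-account creds don't expire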
+ def _run_output(cmd):
+     proc = subprocess.run(
+         cmd, shell=True, check=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
+     )
+     return proc.stdout.decode('ascii')
+
+
+ def is_api_disabled(endpoint: str, project_id: str) -> bool:
+     proc = subprocess.run(
+         (
+             f'gcloud services list --project {project_id} '
+             f' | grep {endpoint}.googleapis.com'
+         ),
+         check=False,
+         shell=True,
+         stderr=subprocess.PIPE,
+         stdout=subprocess.PIPE,
+     )
+     return proc.returncode != 0
+
+
+ class GcsStore(storage_utils.AbstractStore):
+     """GcsStore inherits from Storage Object and represents the backend
+     for GCS buckets.
+     """
+
+     # k8s secret name for gcp credentials
+     _GCP_SECRET_NAME = f'{GCP_SECRET_NAME}-{common_utils.user_and_hostname_hash()}'
+     _GCP_CREDENTIALS_KEY = GCP_CREDENTIALS_KEY
+
+     _ACCESS_DENIED_MESSAGE = 'AccessDeniedException'
+
+     _INDENT_PREFIX = ' '
+     _DEPENDENCY_HINT = (
+         'GCP tools are not installed. Run the following commands:\n'
+         # Install the Google Cloud SDK:
+         f'{_INDENT_PREFIX} $ pip install google-api-python-client\n'
+         f'{_INDENT_PREFIX} $ conda install -c conda-forge '
+         'google-cloud-sdk -y'
+     )
+
+     _CREDENTIAL_HINT = (
+         'Run the following commands:\n'
+         # This authenticates the CLI to make `gsutil` work:
+         f'{_INDENT_PREFIX} $ gcloud init\n'
+         # This will generate
+         # ~/.config/gcloud/application_default_credentials.json.
+         f'{_INDENT_PREFIX} $ gcloud auth application-default login\n'
+         f'{_INDENT_PREFIX}For more info: '
+         'https://konduktor.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp'  # noqa: E501
+     )
+     _APPLICATION_CREDENTIAL_HINT = (
+         'Run the following commands:\n'
+         f'{_INDENT_PREFIX} $ gcloud auth application-default login\n'
+         f'{_INDENT_PREFIX}Or set the environment variable '
+         'GOOGLE_APPLICATION_CREDENTIALS '
+         'to the path of your service account key file.\n'
+         f'{_INDENT_PREFIX}For more info: '
+         'https://konduktor.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp'  # noqa: E501
+     )
+
+     _REPR = 'GcsStore'
+
+     def __init__(
+         self,
+         name: str,
+         source: str,
+         region: Optional[str] = 'us-central1',
+         is_sky_managed: Optional[bool] = False,
+         sync_on_reconstruction: Optional[bool] = True,
+         _bucket_sub_path: Optional[str] = None,
+     ):
+         self.client: 'gcs_storage.Client'
+         self.bucket: 'constants.StorageHandle'
+         super().__init__(
+             name,
+             source,
+             region,
+             is_sky_managed,
+             sync_on_reconstruction,
+             _bucket_sub_path,
+         )
+
+     def __repr__(self):
+         return self._REPR
+
+     def _validate(self):
+         if self.source is not None and isinstance(self.source, str):
+             # if self.source.startswith('s3://'):
+             #     assert self.name == data_utils.split_s3_path(self.source)[0], (
+             #         'S3 Bucket is specified as path, the name should be the'
+             #         ' same as S3 bucket.')
+             #     assert data_utils.verify_s3_bucket(self.name), (
+             #         f'Source specified as {self.source}, an S3 bucket. ',
+             #         'S3 Bucket should exist.')
+             if self.source.startswith('gs://'):
+                 assert self.name == data_utils.split_gcs_path(self.source)[0], (
+                     'GCS Bucket is specified as path, the name should be '
+                     'the same as GCS bucket.'
+                 )
+             # elif data_utils.is_az_container_endpoint(self.source):
+             #     storage_account_name, container_name, _ = (
+             #         data_utils.split_az_path(self.source))
+             #     assert self.name == container_name, (
+             #         'Azure bucket is specified as path, the name should be '
+             #         'the same as Azure bucket.')
+             #     assert data_utils.verify_az_bucket(
+             #         storage_account_name, self.name), (
+             #         f'Source specified as {self.source}, an Azure bucket. '
+             #         'Azure bucket should exist.')
+             # elif self.source.startswith('r2://'):
+             #     assert self.name == data_utils.split_r2_path(self.source)[0], (
+             #         'R2 Bucket is specified as path, the name should be '
+             #         'the same as R2 bucket.')
+             #     assert data_utils.verify_r2_bucket(self.name), (
+             #         f'Source specified as {self.source}, a R2 bucket. ',
+             #         'R2 Bucket should exist.')
+             # elif self.source.startswith('cos://'):
+             #     assert self.name == data_utils.split_cos_path(self.source)[0], (
+             #         'COS Bucket is specified as path, the name should be '
+             #         'the same as COS bucket.')
+             #     assert data_utils.verify_ibm_cos_bucket(self.name), (
+             #         f'Source specified as {self.source}, a COS bucket. ',
+             #         'COS Bucket should exist.')
+         # Validate name
+         self.name = self.validate_name(self.name)
+
+     @classmethod
+     def validate_name(cls, name: str) -> str:
+         """Validates the name of the GCS store.
+
+         Source for rules: https://cloud.google.com/storage/docs/buckets#naming
+         """
+
+         def _raise_no_traceback_name_error(err_str):
+             with ux_utils.print_exception_no_traceback():
+                 raise exceptions.StorageNameError(err_str)
+
+         if name is not None and isinstance(name, str):
+             # Check for overall length
+             if not 3 <= len(name) <= 222:
+                 _raise_no_traceback_name_error(
+                     f'Invalid store name: name {name} must contain 3-222 characters.'
+                 )
+
+             # Check for valid characters and start/end with a number or letter
+             pattern = r'^[a-z0-9][-a-z0-9._]*[a-z0-9]$'
+             if not re.match(pattern, name):
+                 _raise_no_traceback_name_error(
+                     f'Invalid store name: name {name} can only contain '
+                     'lowercase letters, numeric characters, dashes (-), '
+                     'underscores (_), and dots (.). Spaces are not allowed. '
+                     'Names must start and end with a number or letter.'
+                 )
+
+             # Check for 'goog' prefix and 'google' in the name
+             if name.startswith('goog') or any(
+                 s in name for s in ['google', 'g00gle', 'go0gle', 'g0ogle']
+             ):
+                 _raise_no_traceback_name_error(
+                     f'Invalid store name: name {name} cannot begin with the '
+                     '"goog" prefix or contain "google" in various forms.'
+                 )
+
+             # Check for dot-separated components length
+             components = name.split('.')
+             if any(len(component) > 63 for component in components):
+                 _raise_no_traceback_name_error(
+                     'Invalid store name: Dot-separated components in name '
+                     f'{name} can be no longer than 63 characters.'
+                 )
+
+             if '..' in name or '.-' in name or '-.' in name:
+                 _raise_no_traceback_name_error(
+                     f'Invalid store name: name {name} must not contain two '
+                     'adjacent periods or a dot next to a hyphen.'
+                 )
+
+             # Check for IP address format
+             ip_pattern = r'^(?:\d{1,3}\.){3}\d{1,3}$'
+             if re.match(ip_pattern, name):
+                 _raise_no_traceback_name_error(
+                     f'Invalid store name: name {name} cannot be represented as '
+                     'an IP address in dotted-decimal notation '
+                     '(for example, 192.168.5.4).'
+                 )
+         else:
+             _raise_no_traceback_name_error('Store name must be specified.')
+         return name
+
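A quick usage sketch of the validator above (hypothetical names; the second call trips the IP-address rule):

GcsStore.validate_name('my-training-data.v1')  # ok: returns the name unchanged
try:
    GcsStore.validate_name('192.168.5.4')  # rejected: dotted-decimal IP form
except exceptions.StorageNameError as e:
    print(e)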
+     def initialize(self):
+         """Initializes the GCS store object on the cloud.
+
+         Initialization involves fetching the bucket if it exists, or creating
+         it if it does not.
+
+         Raises:
+             StorageBucketCreateError: If bucket creation fails
+             StorageBucketGetError: If fetching existing bucket fails
+             StorageInitError: If general initialization fails.
+         """
+         self.client = gcp.storage_client()
+         self.bucket, is_new_bucket = self._get_bucket()
+         if self.is_sky_managed is None:
+             # If is_sky_managed is not specified, then this is a new storage
+             # object (i.e., did not exist in global_user_state) and we should
+             # set the is_sky_managed property.
+             # If is_sky_managed is specified, then we take no action.
+             self.is_sky_managed = is_new_bucket
+
+     def upload(self):
+         """Uploads source to store bucket.
+
+         Upload must be called by the Storage handler - it is not called on
+         Store initialization.
+
+         Raises:
+             StorageUploadError: if upload fails.
+         """
+         try:
+             if isinstance(self.source, list):
+                 self.batch_gsutil_rsync(self.source, create_dirs=True)
+             elif self.source is not None:
+                 if self.source.startswith('gs://'):
+                     pass
+                 elif self.source.startswith('s3://'):
+                     self._transfer_to_gcs()
+                 elif self.source.startswith('r2://'):
+                     self._transfer_to_gcs()
+                 else:
+                     # If a single directory is specified in source, upload
+                     # contents to root of bucket by suffixing /*.
+                     self.batch_gsutil_rsync([self.source])
+         except exceptions.StorageUploadError:
+             raise
+         except Exception as e:
+             raise exceptions.StorageUploadError(
+                 f'Upload failed for store {self.name}'
+             ) from e
+
+     def delete(self) -> None:
+         deleted_by_skypilot = self._delete_gcs_bucket(self.name)
+         if deleted_by_skypilot:
+             msg_str = f'Deleted GCS bucket {self.name}.'
+         else:
+             msg_str = (
+                 f'GCS bucket {self.name} may have been deleted '
+                 f'externally. Removing from local state.'
+             )
+         logger.info(f'{colorama.Fore.GREEN}{msg_str}{colorama.Style.RESET_ALL}')
+
+     def get_handle(self) -> 'constants.StorageHandle':
+         return self.client.get_bucket(self.name)
+
+     def batch_gsutil_cp(
+         self, source_path_list: List['constants.Path'], create_dirs: bool = False
+     ) -> None:
+         """Invokes gsutil cp -n to batch upload a list of local paths.
+
+         The -n flag to gsutil cp checks the existence of an object before
+         uploading, making it similar to gsutil rsync. Since it allows
+         specification of a list of files, it is faster than calling gsutil
+         rsync on each file. However, unlike rsync, files are compared based
+         on just their filename, and any updates to a file would not be copied
+         to the bucket.
+         """
+         # Generate message for upload
+         if len(source_path_list) > 1:
+             source_message = f'{len(source_path_list)} paths'
+         else:
+             source_message = source_path_list[0]
+
+         # If the source_path list contains a directory, then gsutil cp -n
+         # copies the dir as is to the root of the bucket. To copy the
+         # contents of directory to the root, add /* to the directory path
+         # e.g., /mydir/*
+         source_path_list = [
+             str(path) + '/*' if (os.path.isdir(path) and not create_dirs) else str(path)
+             for path in source_path_list
+         ]
+         copy_list = '\n'.join(
+             os.path.abspath(os.path.expanduser(p)) for p in source_path_list
+         )
+         gsutil_alias, alias_gen = data_utils.get_gsutil_command()
+         sub_path = f'/{self._bucket_sub_path}' if self._bucket_sub_path else ''
+         sync_command = (
+             f'{alias_gen}; echo "{copy_list}" | {gsutil_alias} '
+             f'cp -e -n -r -I gs://{self.name}{sub_path}'
+         )
+
+         log_path = logging.generate_tmp_logging_file_path(
+             constants._STORAGE_LOG_FILE_NAME
+         )
+
+         with rich_utils.safe_status(
+             ux_utils.spinner_message(
+                 f'Syncing {source_message} -> gs://{self.name}{sub_path}'
+             )
+         ):
+             data_utils.run_upload_cli(
+                 sync_command,
+                 self._ACCESS_DENIED_MESSAGE,
+                 bucket_name=self.name,
+                 log_path=log_path,
+             )
+
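For concreteness, a minimal sketch of the pipeline this method builds, with hypothetical paths and bucket name, and plain `gsutil` standing in for the aliased command returned by data_utils.get_gsutil_command():

import os
import shlex

paths = ['/home/me/data', '/home/me/notes.txt']
# Directories are suffixed with /* so their contents land at the bucket root.
paths = [p + '/*' if os.path.isdir(p) else p for p in paths]
copy_list = '\n'.join(os.path.abspath(os.path.expanduser(p)) for p in paths)
# -I reads the path list from stdin; -n skips objects that already exist.
cmd = f'echo {shlex.quote(copy_list)} | gsutil cp -e -n -r -I gs://my-bucket'
print(cmd)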
+     def batch_gsutil_rsync(
+         self, source_path_list: List['constants.Path'], create_dirs: bool = False
+     ) -> None:
+         """Invokes gsutil rsync to batch upload a list of local paths.
+
+         Since gsutil rsync does not support include commands, we use a
+         negative lookahead regex to exclude everything other than the path(s)
+         we want to upload.
+
+         Since gsutil rsync does not support batch operations, we construct
+         multiple commands to be run in parallel.
+
+         Args:
+             source_path_list: List of paths to local files or directories
+             create_dirs: If the local_path is a directory and this is set to
+                 False, the contents of the directory are directly uploaded to
+                 root of the bucket. If the local_path is a directory and this
+                 is set to True, the directory is created in the bucket root
+                 and contents are uploaded to it.
+         """
+
+         def get_file_sync_command(base_dir_path, file_names):
+             sync_format = '|'.join(file_names)
+             gsutil_alias, alias_gen = data_utils.get_gsutil_command()
+             base_dir_path = shlex.quote(base_dir_path)
+             sync_command = (
+                 f'{alias_gen}; {gsutil_alias} '
+                 f"rsync -e -x '^(?!{sync_format}$).*' "
+                 f'{base_dir_path} gs://{self.name}{sub_path}'
+             )
+             return sync_command
+
+         def get_dir_sync_command(src_dir_path, dest_dir_name):
+             excluded_list = storage_utils.get_excluded_files(src_dir_path)
+             # we exclude .git directory from the sync
+             excluded_list.append(r'^\.git/.*$')
+             excludes = '|'.join(excluded_list)
+             gsutil_alias, alias_gen = data_utils.get_gsutil_command()
+             src_dir_path = shlex.quote(src_dir_path)
+             sync_command = (
+                 f'{alias_gen}; {gsutil_alias} '
+                 f"rsync -e -r -x '({excludes})' {src_dir_path} "
+                 f'gs://{self.name}{sub_path}/{dest_dir_name}'
+             )
+             return sync_command
+
+         sub_path = f'/{self._bucket_sub_path}' if self._bucket_sub_path else ''
+         # Generate message for upload
+         if len(source_path_list) > 1:
+             source_message = f'{len(source_path_list)} paths'
+         else:
+             source_message = source_path_list[0]
+
+         log_path = logging.generate_tmp_logging_file_path(
+             constants._STORAGE_LOG_FILE_NAME
+         )
+         sync_path = f'{source_message} -> gs://{self.name}{sub_path}/'
+         with rich_utils.safe_status(
+             ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)
+         ):
+             data_utils.parallel_upload(
+                 source_path_list,
+                 get_file_sync_command,
+                 get_dir_sync_command,
+                 log_path,
+                 self.name,
+                 self._ACCESS_DENIED_MESSAGE,
+                 create_dirs=create_dirs,
+                 max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS,
+             )
+         logger.info(
+             ux_utils.finishing_message(f'Storage synced: {sync_path}', log_path)
+         )
+
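A minimal sketch of the negative-lookahead exclusion described in the docstring: build the same pattern that get_file_sync_command passes to gsutil's -x flag and test which relative paths it would exclude (file names here are hypothetical):

import re

file_names = ['model.pt', 'config.yaml']
pattern = re.compile(f"^(?!{'|'.join(file_names)}$).*")
assert pattern.match('notes.txt')      # excluded: not on the allow-list
assert not pattern.match('model.pt')   # kept: the lookahead rejects the match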
+     def _get_bucket(self) -> Tuple['constants.StorageHandle', bool]:
+         """Obtains the GCS bucket.
+
+         If the bucket exists, this method will connect to the bucket.
+
+         If the bucket does not exist, there are three cases:
+           1) Raise an error if the bucket source starts with gs://
+           2) Return None if bucket has been externally deleted and
+              sync_on_reconstruction is False
+           3) Create and return a new bucket otherwise
+
+         Raises:
+             StorageSpecError: If externally created bucket is attempted to be
+                 mounted without specifying storage source.
+             StorageBucketCreateError: If creating the bucket fails
+             StorageBucketGetError: If fetching a bucket fails
+             StorageExternalDeletionError: If externally deleted storage is
+                 attempted to be fetched while reconstructing the storage for
+                 'sky storage delete' or 'sky start'
+         """
+         try:
+             bucket = self.client.get_bucket(self.name)
+             self._validate_existing_bucket()
+             return bucket, False
+         except gcp.not_found_exception() as e:
+             if isinstance(self.source, str) and self.source.startswith('gs://'):
+                 with ux_utils.print_exception_no_traceback():
+                     raise exceptions.StorageBucketGetError(
+                         'Attempted to use a non-existent bucket as a source: '
+                         f'{self.source}'
+                     ) from e
+             else:
+                 # If bucket cannot be found (i.e., does not exist), it is to be
+                 # created by Sky. However, creation is skipped if Store object
+                 # is being reconstructed for deletion or re-mount with
+                 # sky start, and error is raised instead.
+                 if self.sync_on_reconstruction:
+                     bucket = self._create_gcs_bucket(self.name, self.region)
+                     return bucket, True
+                 else:
+                     # This is raised when Storage object is reconstructed for
+                     # sky storage delete or to re-mount Storages with sky start
+                     # but the storage is already removed externally.
+                     raise exceptions.StorageExternalDeletionError(
+                         f'Attempted to fetch a non-existent bucket: {self.name}'
+                     ) from e
+         except gcp.forbidden_exception():
+             # Try public bucket to see if bucket exists
+             logger.info('External Bucket detected; Connecting to external bucket...')
+             try:
+                 a_client = gcp.anonymous_storage_client()
+                 bucket = a_client.bucket(self.name)
+                 # Check if bucket can be listed/read from
+                 next(bucket.list_blobs())
+                 return bucket, False
+             except (gcp.not_found_exception(), ValueError) as e:
+                 command = f'gsutil ls gs://{self.name}'
+                 with ux_utils.print_exception_no_traceback():
+                     raise exceptions.StorageBucketGetError(
+                         f'Bucket {self.name} does not exist.'
+                         f' To debug, consider running `{command}`.'
+                     ) from e
+
+     def _download_file(self, remote_path: str, local_path: str) -> None:
+         """Downloads a file from the GS bucket to a local path.
+
+         Args:
+             remote_path: str; Remote path on GS bucket
+             local_path: str; Local path on user's device
+         """
+         blob = self.bucket.blob(remote_path)
+         blob.download_to_filename(local_path, timeout=None)
+
+     def _create_gcs_bucket(
+         self, bucket_name: str, region='us-central1'
+     ) -> 'constants.StorageHandle':
+         """Creates a GCS bucket with a specific name in a specific region.
+
+         Args:
+             bucket_name: str; Name of bucket
+             region: str; Region name, e.g. us-central1, us-west1
+         """
+         try:
+             bucket = self.client.bucket(bucket_name)
+             bucket.storage_class = 'STANDARD'
+             new_bucket = self.client.create_bucket(bucket, location=region)
+         except Exception as e:  # pylint: disable=broad-except
+             with ux_utils.print_exception_no_traceback():
+                 raise exceptions.StorageBucketCreateError(
+                     f'Attempted to create a bucket {self.name} but failed.'
+                 ) from e
+         logger.info(
+             f'  {colorama.Style.DIM}Created GCS bucket {new_bucket.name!r} in '
+             f'{new_bucket.location} with storage class '
+             f'{new_bucket.storage_class}{colorama.Style.RESET_ALL}'
+         )
+         return new_bucket
+
+     def _delete_gcs_bucket(self, bucket_name: str) -> bool:
+         """Deletes a GCS bucket, including all objects in the bucket.
+
+         Args:
+             bucket_name: str; Name of bucket
+
+         Returns:
+             bool; True if bucket was deleted, False if it was deleted externally.
+         """
+
+         with rich_utils.safe_status(
+             ux_utils.spinner_message(f'Deleting GCS bucket [green]{bucket_name}')
+         ):
+             try:
+                 self.client.get_bucket(bucket_name)
+             except gcp.forbidden_exception() as e:
+                 # Try public bucket to see if bucket exists
+                 with ux_utils.print_exception_no_traceback():
+                     raise PermissionError(
+                         'External Bucket detected. User not allowed to delete '
+                         'external bucket.'
+                     ) from e
+             except gcp.not_found_exception():
+                 # If bucket does not exist, it may have been deleted externally.
+                 # Do a no-op in that case.
+                 logger.debug(f'Bucket {bucket_name} does not exist.')
+                 return False
+             try:
+                 gsutil_alias, alias_gen = data_utils.get_gsutil_command()
+                 remove_obj_command = (
+                     f'{alias_gen};{gsutil_alias} rm -r gs://{bucket_name}'
+                 )
+                 subprocess.check_output(
+                     remove_obj_command,
+                     stderr=subprocess.STDOUT,
+                     shell=True,
+                     executable='/bin/bash',
+                 )
+                 return True
+             except subprocess.CalledProcessError as e:
+                 with ux_utils.print_exception_no_traceback():
+                     raise exceptions.StorageBucketDeleteError(
+                         f'Failed to delete GCS bucket {bucket_name}. '
+                         f'Detailed error: {e.output}'
+                     )
+
+     @classmethod
+     def _find_application_key_path(cls) -> str:
+         # Check the application default credentials in the environment variable.
+         # If the file does not exist, fall back to the default path.
+         application_key_path = os.environ.get(_GCP_APPLICATION_CREDENTIAL_ENV, None)
+         if application_key_path is not None:
+             if not os.path.isfile(os.path.expanduser(application_key_path)):
+                 raise FileNotFoundError(
+                     f'{_GCP_APPLICATION_CREDENTIAL_ENV}={application_key_path},'
+                     ' but the file does not exist.'
+                 )
+             return application_key_path
+         if not os.path.isfile(
+             os.path.expanduser(DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH)
+         ):
+             # Fallback to the default application credential path.
+             raise FileNotFoundError(DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH)
+         return DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
+
+     @classmethod
+     def _get_identity_type(cls) -> Optional[GCPIdentityType]:
+         try:
+             account = cls.get_active_user_identity()
+         except exceptions.CloudUserIdentityError:
+             return None
+         if account is None:
+             return None
+         assert account is not None
+         if GCPIdentityType.SERVICE_ACCOUNT.value in account[0]:
+             return GCPIdentityType.SERVICE_ACCOUNT
+         return GCPIdentityType.SHARED_CREDENTIALS_FILE
+
+     @classmethod
+     def get_project_id(cls, dryrun: bool = False) -> str:
+         if dryrun:
+             return 'dryrun-project-id'
+         # pylint: disable=import-outside-toplevel
+         from google import auth  # type: ignore
+
+         _, project_id = auth.default()
+         if project_id is None:
+             raise exceptions.CloudUserIdentityError(
+                 'Failed to get GCP project id. Please make sure you have '
+                 'run the following: \n'
+                 f'{cls._INDENT_PREFIX}gcloud init; \n'
+                 f'{cls._INDENT_PREFIX}gcloud auth application-default login'
+             )
+         return project_id
+
+     @classmethod
+     def get_user_identities(cls) -> List[List[str]]:
+         """Returns the email address + project id of the active user."""
+         try:
+             account = _run_output(
+                 'gcloud auth list --filter=status:ACTIVE --format="value(account)"'
+             )
+             account = account.strip()
+         except subprocess.CalledProcessError as e:
+             with ux_utils.print_exception_no_traceback():
+                 raise exceptions.CloudUserIdentityError(
+                     f'Failed to get GCP user identity with unknown '
+                     f'exception.\n'
+                     ' Reason: '
+                     f'{common_utils.format_exception(e, use_bracket=True)}'
+                 ) from e
+         if not account:
+             with ux_utils.print_exception_no_traceback():
+                 raise exceptions.CloudUserIdentityError(
+                     'No GCP account is activated. Try running `gcloud '
+                     'auth list --filter=status:ACTIVE '
+                     '--format="value(account)"` and ensure it correctly '
+                     'returns the current user.'
+                 )
+         try:
+             project_id = cls.get_project_id()
+         except Exception as e:  # pylint: disable=broad-except
+             with ux_utils.print_exception_no_traceback():
+                 raise exceptions.CloudUserIdentityError(
+                     f'Failed to get GCP user identity with unknown '
+                     f'exception.\n'
+                     ' Reason: '
+                     f'{common_utils.format_exception(e, use_bracket=True)}'
+                 ) from e
+         # TODO: Return a list of identities in the profile when we support
+         # automatic switching for GCP. Currently we only support one identity.
+         return [[f'{account} [project_id={project_id}]']]
+
+     @classmethod
+     def get_active_user_identity_str(cls) -> Optional[str]:
+         user_identity = cls.get_active_user_identity()
+         if user_identity is None:
+             return None
+         return user_identity[0].replace('\n', '')
+
+     @classmethod
+     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
+         """Checks if the credentials are valid for the GCS store."""
+         try:
+             # Check google-api-python-client installation.
+             import googleapiclient  # noqa: F401
+             from google import auth  # type: ignore
+
+             # Check the installation of google-cloud-sdk.
+             _run_output('gcloud --version')
+         except (ImportError, subprocess.CalledProcessError) as e:
+             return False, (
+                 f'{cls._DEPENDENCY_HINT}\n'
+                 f'{cls._INDENT_PREFIX}Credentials may also need to be set. '
+                 f'{cls._CREDENTIAL_HINT}\n'
+                 f'{cls._INDENT_PREFIX}Details: '
+                 f'{common_utils.format_exception(e, use_bracket=True)}'
+             )
+
+         identity_type = cls._get_identity_type()
+         if identity_type == GCPIdentityType.SHARED_CREDENTIALS_FILE:
+             # These files are only required when using the shared credentials
+             # to access GCP. They are not required when using a service account.
+             try:
+                 # These files are required because they will be synced to remote
+                 # VMs for `gsutil` to access private storage buckets.
+                 # `auth.default()` does not guarantee these files exist.
+                 for file in [
+                     '~/.config/gcloud/access_tokens.db',
+                     '~/.config/gcloud/credentials.db',
+                 ]:
+                     if not os.path.isfile(os.path.expanduser(file)):
+                         raise FileNotFoundError(file)
+             except FileNotFoundError as e:
+                 return False, (
+                     f'Credentials are not set. '
+                     f'{cls._CREDENTIAL_HINT}\n'
+                     f'{cls._INDENT_PREFIX}Details: '
+                     f'{common_utils.format_exception(e, use_bracket=True)}'
+                 )
+
+             try:
+                 cls._find_application_key_path()
+             except FileNotFoundError as e:
+                 return False, (
+                     f'Application credentials are not set. '
+                     f'{cls._APPLICATION_CREDENTIAL_HINT}\n'
+                     f'{cls._INDENT_PREFIX}Details: '
+                     f'{common_utils.format_exception(e, use_bracket=True)}'
+                 )
+
+         try:
+             # Check if application default credentials are set.
+             project_id = cls.get_project_id()
+
+             # Check if the user is activated.
+             identity = cls.get_active_user_identity()
+         except (
+             auth.exceptions.DefaultCredentialsError,
+             exceptions.CloudUserIdentityError,
+         ) as e:
+             # See also: https://stackoverflow.com/a/53307505/1165051
+             return False, (
+                 'Getting project ID or user identity failed. You can debug '
+                 'with `gcloud auth list`. To fix this, '
+                 f'{cls._CREDENTIAL_HINT[0].lower()}'
+                 f'{cls._CREDENTIAL_HINT[1:]}\n'
+                 f'{cls._INDENT_PREFIX}Details: '
+                 f'{common_utils.format_exception(e, use_bracket=True)}'
+             )
+
+         # Check APIs.
+         apis = (
+             ('cloudresourcemanager', 'Cloud Resource Manager'),
+             ('iam', 'Identity and Access Management (IAM)'),
+             ('storage', 'Cloud Storage'),
+         )
+         enabled_api = False
+         for endpoint, display_name in apis:
+             if is_api_disabled(endpoint, project_id):
+                 # For 'compute': ~55-60 seconds for the first run. If already
+                 # enabled, ~1s. Other API endpoints take ~1-5s to enable.
+                 if endpoint == 'compute':
+                     suffix = ' (free of charge; this may take a minute)'
+                 else:
+                     suffix = ' (free of charge)'
+                 print(f'\nEnabling {display_name} API{suffix}...')
+                 t1 = time.time()
+                 proc = subprocess.run(
+                     f'gcloud services enable {endpoint}.googleapis.com '
+                     f'--project {project_id}',
+                     check=False,
+                     shell=True,
+                     stdout=subprocess.PIPE,
+                     stderr=subprocess.STDOUT,
+                 )
+                 if proc.returncode == 0:
+                     enabled_api = True
+                     print(f'Done. Took {time.time() - t1:.1f} secs.')
+
+         if enabled_api:
+             print(
+                 '\nHint: Enabled GCP API(s) may take a few minutes to take '
+                 'effect. If any Konduktor commands/calls failed, retry after '
+                 'some time.'
+             )
+
+         import google.auth  # noqa: F401
+
+         # This takes user's credential info from
+         # "~/.config/gcloud/application_default_credentials.json".
+         credentials, project = google.auth.default()
+         crm = gcp.build(
+             'cloudresourcemanager', 'v1', credentials=credentials, cache_discovery=False
+         )
+         gcp_minimal_permissions = utils.get_minimal_permissions()
+         permissions = {'permissions': gcp_minimal_permissions}
+         request = crm.projects().testIamPermissions(resource=project, body=permissions)
+         with ux_utils.print_exception_no_traceback():
+             ret_permissions = request.execute().get('permissions', [])
+             diffs = set(gcp_minimal_permissions).difference(set(ret_permissions))
+             if diffs:
+                 identity_str = identity[0] if identity else None
+                 return False, (
+                     'The following permissions are not enabled for the current '
+                     f'GCP identity ({identity_str}):\n '
+                     f'{diffs}\n '
+                     'For more details, visit: https://konduktor.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html'  # noqa: E501
+                 )
+         logger.info(
+             f'GCP credentials are valid '
+             f'for the current identity {logging.CHECK_MARK_EMOJI}'
+         )
+         logger.info('Creating k8s secret with GCP credentials...')
+         set_ok, result = cls.set_secret_credentials()
+         if not set_ok:
+             logger.error(f'Failed to create k8s secret with GCP credentials: {result}')
+             return False, result
+         return True, None
+
+     @classmethod
+     def set_secret_credentials(cls) -> Tuple[bool, Optional[str]]:
+         """Sets the k8s secret storing the GCP credentials."""
+         context = kubernetes_utils.get_current_kube_config_context_name()
+         namespace = kubernetes_utils.get_kube_config_context_namespace()
+         credentials_dir = os.environ.get('CLOUDSDK_CONFIG', DEFAULT_GCP_CREDENTIALS_DIR)
+         credentials_files = [
+             os.path.expanduser(os.path.join(credentials_dir, f))
+             for f in _CREDENTIAL_FILES
+         ]
+         ok, result = kubernetes_utils.set_secret(
+             secret_name=cls._GCP_SECRET_NAME,
+             namespace=namespace,
+             context=context,
+             secret_key=cls._GCP_CREDENTIALS_KEY,
+             secret_value=base64_utils.zip_base64encode(credentials_files),
+         )
+         if not ok:
+             logger.error(f'Failed to set GCP credentials in k8s secret: \n{result}')
+             return False, result
+         else:
+             logger.info(
+                 f'GCP credentials set in k8s secret: {cls._GCP_SECRET_NAME} '
+                 f'in namespace {namespace} in context {context} '
+                 f'{logging.CHECK_MARK_EMOJI}'
+             )
+             return True, None
+
+     @classmethod
+     def get_k8s_credential_name(cls) -> str:
+         return cls._GCP_SECRET_NAME
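The secret value above is the user's gcloud credential files zipped and base64-encoded. A minimal sketch of what such an encoding could look like using only the standard library; base64_utils.zip_base64encode is internal to konduktor and its exact archive layout is an assumption here:

import base64
import io
import os
import zipfile

def zip_base64encode(paths: list) -> str:
    # Zip the given files in memory, then base64-encode the archive bytes
    # so the result can be stored as a string value in a k8s secret.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
        for path in paths:
            if os.path.isfile(path):
                zf.write(path, arcname=os.path.basename(path))
    return base64.b64encode(buf.getvalue()).decode('utf-8')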