skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241013__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -1
  3. sky/adaptors/common.py +6 -2
  4. sky/backends/backend.py +9 -4
  5. sky/backends/backend_utils.py +13 -16
  6. sky/backends/cloud_vm_ray_backend.py +207 -161
  7. sky/backends/local_docker_backend.py +3 -1
  8. sky/benchmark/benchmark_utils.py +5 -4
  9. sky/cli.py +128 -31
  10. sky/clouds/service_catalog/aws_catalog.py +6 -7
  11. sky/clouds/service_catalog/common.py +4 -3
  12. sky/clouds/service_catalog/cudo_catalog.py +11 -1
  13. sky/core.py +4 -2
  14. sky/data/storage.py +44 -32
  15. sky/data/storage_utils.py +12 -7
  16. sky/exceptions.py +5 -0
  17. sky/execution.py +10 -24
  18. sky/jobs/__init__.py +2 -0
  19. sky/jobs/core.py +87 -7
  20. sky/jobs/utils.py +35 -19
  21. sky/optimizer.py +50 -37
  22. sky/provision/aws/config.py +15 -6
  23. sky/provision/azure/config.py +14 -3
  24. sky/provision/azure/instance.py +15 -9
  25. sky/provision/kubernetes/instance.py +3 -1
  26. sky/provision/kubernetes/utils.py +25 -0
  27. sky/provision/provisioner.py +63 -74
  28. sky/serve/core.py +42 -40
  29. sky/sky_logging.py +9 -5
  30. sky/skylet/log_lib.py +5 -4
  31. sky/skylet/providers/lambda_cloud/node_provider.py +1 -1
  32. sky/utils/cli_utils/status_utils.py +168 -21
  33. sky/utils/command_runner.py +11 -11
  34. sky/utils/common_utils.py +22 -5
  35. sky/utils/controller_utils.py +78 -29
  36. sky/utils/env_options.py +22 -7
  37. sky/utils/log_utils.py +39 -24
  38. sky/utils/resources_utils.py +23 -0
  39. sky/utils/rich_utils.py +55 -5
  40. sky/utils/ux_utils.py +63 -4
  41. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/METADATA +1 -1
  42. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/RECORD +46 -46
  43. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/LICENSE +0 -0
  44. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/WHEEL +0 -0
  45. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/entry_points.txt +0 -0
  46. {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241013.dist-info}/top_level.txt +0 -0
sky/utils/cli_utils/status_utils.py CHANGED
@@ -1,12 +1,16 @@
  """Utilities for sky status."""
- from typing import Any, Callable, Dict, List, Optional
+ from typing import Any, Callable, Dict, List, Optional, Tuple

  import click
  import colorama

  from sky import backends
+ from sky import clouds as sky_clouds
+ from sky import resources as resources_lib
  from sky import status_lib
+ from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.skylet import constants
+ from sky.utils import common_utils
  from sky.utils import log_utils
  from sky.utils import resources_utils

@@ -19,25 +23,6 @@ _ClusterRecord = Dict[str, Any]
  _ClusterCostReportRecord = Dict[str, Any]


- def truncate_long_string(s: str, max_length: int = 35) -> str:
-     if len(s) <= max_length:
-         return s
-     splits = s.split(' ')
-     if len(splits[0]) > max_length:
-         return splits[0][:max_length] + '...'  # Use '…'?
-     # Truncate on word boundary.
-     i = 0
-     total = 0
-     for i, part in enumerate(splits):
-         total += len(part)
-         if total >= max_length:
-             break
-     prefix = ' '.join(splits[:i])
-     if len(prefix) < max_length:
-         prefix += s[len(prefix):max_length]
-     return prefix + '...'
-
-
  class StatusColumn:
      """One column of the displayed cluster table"""

@@ -54,7 +39,7 @@ class StatusColumn:
      def calc(self, record):
          val = self.calc_func(record)
          if self.trunc_length != 0:
-             val = truncate_long_string(str(val), self.trunc_length)
+             val = common_utils.truncate_long_string(str(val), self.trunc_length)
          return val


@@ -316,3 +301,165 @@ def _get_estimated_cost_for_cost_report(
          return '-'

      return f'$ {cost:.2f}'
+
+
+ def show_kubernetes_cluster_status_table(clusters: List[Any],
+                                          show_all: bool) -> None:
+     """Compute cluster table values and display for Kubernetes clusters."""
+     status_columns = [
+         StatusColumn('USER', lambda c: c['user']),
+         StatusColumn('NAME', lambda c: c['cluster_name']),
+         StatusColumn(
+             'LAUNCHED',
+             lambda c: log_utils.readable_time_duration(c['launched_at'])),
+         StatusColumn('RESOURCES',
+                      lambda c: c['resources_str'],
+                      trunc_length=70 if not show_all else 0),
+         StatusColumn('STATUS', lambda c: c['status'].colored_str()),
+         # TODO(romilb): We should consider adding POD_NAME field here when --all
+         # is passed to help users fetch pod name programmatically.
+     ]
+
+     columns = [
+         col.name for col in status_columns if col.show_by_default or show_all
+     ]
+     cluster_table = log_utils.create_table(columns)
+
+     # Sort table by user, then by cluster name
+     sorted_clusters = sorted(clusters,
+                              key=lambda c: (c['user'], c['cluster_name']))
+
+     for cluster in sorted_clusters:
+         row = []
+         for status_column in status_columns:
+             if status_column.show_by_default or show_all:
+                 row.append(status_column.calc(cluster))
+         cluster_table.add_row(row)
+
+     if clusters:
+         click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                    f'SkyPilot clusters'
+                    f'{colorama.Style.RESET_ALL}')
+         click.echo(cluster_table)
+     else:
+         click.echo('No SkyPilot resources found in the '
+                    'active Kubernetes context.')
+
+
+ def process_skypilot_pods(
+     pods: List[Any],
+     context: Optional[str] = None
+ ) -> Tuple[List[Dict[Any, Any]], Dict[str, Any], Dict[str, Any]]:
+     """Process SkyPilot pods on k8s to extract cluster and controller info.
+
+     Args:
+         pods: List of Kubernetes pod objects.
+         context: Kubernetes context name, used to detect GPU label formatter.
+
+     Returns:
+         A tuple containing:
+         - List of dictionaries with cluster information.
+         - Dictionary of job controller information.
+         - Dictionary of serve controller information.
+
+         Each dictionary contains the following keys:
+         'cluster_name_on_cloud': The cluster_name_on_cloud used by SkyPilot
+         'cluster_name': The cluster name without the user hash
+         'user': The user who created the cluster. Fetched from pod label
+         'status': The cluster status (assumed UP if pod exists)
+         'pods': List of pod objects in the cluster
+         'launched_at': Timestamp of when the cluster was launched
+         'resources': sky.Resources object for the cluster
+     """
+     clusters: Dict[str, Dict] = {}
+     jobs_controllers: Dict[str, Dict] = {}
+     serve_controllers: Dict[str, Dict] = {}
+
+     for pod in pods:
+         cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+         cluster_name = cluster_name_on_cloud.rsplit(
+             '-', 1
+         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
+
+         # Check if cluster name is name of a controller
+         # Can't use controller_utils.Controllers.from_name(cluster_name)
+         # because hash is different across users
+         if 'controller' in cluster_name_on_cloud:
+             start_time = pod.status.start_time.timestamp()
+             controller_info = {
+                 'cluster_name_on_cloud': cluster_name_on_cloud,
+                 'cluster_name': cluster_name,
+                 'user': pod.metadata.labels.get('skypilot-user'),
+                 'status': status_lib.ClusterStatus.UP,
+                 # Assuming UP if pod exists
+                 'pods': [pod],
+                 'launched_at': start_time
+             }
+             if 'sky-jobs-controller' in cluster_name_on_cloud:
+                 jobs_controllers[cluster_name_on_cloud] = controller_info
+             elif 'sky-serve-controller' in cluster_name_on_cloud:
+                 serve_controllers[cluster_name_on_cloud] = controller_info
+
+         if cluster_name_on_cloud not in clusters:
+             # Parse the start time for the cluster
+             start_time = pod.status.start_time
+             if start_time is not None:
+                 start_time = pod.status.start_time.timestamp()
+
+             # Parse resources
+             cpu_request = kubernetes_utils.parse_cpu_or_gpu_resource(
+                 pod.spec.containers[0].resources.requests.get('cpu', '0'))
+             memory_request = kubernetes_utils.parse_memory_resource(
+                 pod.spec.containers[0].resources.requests.get('memory', '0'),
+                 unit='G')
+             gpu_count = kubernetes_utils.parse_cpu_or_gpu_resource(
+                 pod.spec.containers[0].resources.requests.get(
+                     'nvidia.com/gpu', '0'))
+             if gpu_count > 0:
+                 label_formatter, _ = (
+                     kubernetes_utils.detect_gpu_label_formatter(context))
+                 assert label_formatter is not None, (
+                     'GPU label formatter cannot be None if there are pods '
+                     f'requesting GPUs: {pod.metadata.name}')
+                 gpu_label = label_formatter.get_label_key()
+                 # Get GPU name from pod node selector
+                 if pod.spec.node_selector is not None:
+                     gpu_name = label_formatter.get_accelerator_from_label_value(
+                         pod.spec.node_selector.get(gpu_label))
+
+             resources = resources_lib.Resources(
+                 cloud=sky_clouds.Kubernetes(),
+                 cpus=int(cpu_request),
+                 memory=int(memory_request),
+                 accelerators=(f'{gpu_name}:{gpu_count}'
+                               if gpu_count > 0 else None))
+             if pod.status.phase == 'Pending':
+                 # If pod is pending, do not show it in the status
+                 continue
+
+             clusters[cluster_name_on_cloud] = {
+                 'cluster_name_on_cloud': cluster_name_on_cloud,
+                 'cluster_name': cluster_name,
+                 'user': pod.metadata.labels.get('skypilot-user'),
+                 'status': status_lib.ClusterStatus.UP,
+                 'pods': [],
+                 'launched_at': start_time,
+                 'resources': resources,
+             }
+         else:
+             # Update start_time if this pod started earlier
+             pod_start_time = pod.status.start_time
+             if pod_start_time is not None:
+                 pod_start_time = pod_start_time.timestamp()
+                 if pod_start_time < clusters[cluster_name_on_cloud][
+                         'launched_at']:
+                     clusters[cluster_name_on_cloud][
+                         'launched_at'] = pod_start_time
+         clusters[cluster_name_on_cloud]['pods'].append(pod)
+     # Update resources_str in clusters:
+     for cluster_name, cluster in clusters.items():
+         resources = cluster['resources']
+         num_pods = len(cluster['pods'])
+         resources_str = f'{num_pods}x {resources}'
+         cluster['resources_str'] = resources_str
+     return list(clusters.values()), jobs_controllers, serve_controllers
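Note (editor's sketch, not part of the diff): `process_skypilot_pods` expects pod objects as returned by the Kubernetes Python client, keyed off the `skypilot-cluster` and `skypilot-user` labels shown above. A minimal way to wire the two new helpers together might look like the following; the pod-fetching path is an assumption, and only the two `status_utils` calls come from this diff:

    # Assumed glue code: fetch pods carrying the 'skypilot-cluster' label from
    # the active context, then render the new Kubernetes status table.
    from kubernetes import client, config

    from sky.utils.cli_utils import status_utils

    config.load_kube_config()
    pods = client.CoreV1Api().list_pod_for_all_namespaces(
        label_selector='skypilot-cluster').items
    clusters, jobs_ctrls, serve_ctrls = status_utils.process_skypilot_pods(pods)
    status_utils.show_kubernetes_cluster_status_table(clusters, show_all=False)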
sky/utils/command_runner.py CHANGED
@@ -171,7 +171,7 @@ class CommandRunner:
          cmd: Union[str, List[str]],
          process_stream: bool,
          separate_stderr: bool,
-         skip_lines: int,
+         skip_num_lines: int,
          source_bashrc: bool = False,
      ) -> str:
          """Returns the command to run."""
@@ -203,12 +203,12 @@ class CommandRunner:
          ]
          if not separate_stderr:
              command.append('2>&1')
-         if not process_stream and skip_lines:
+         if not process_stream and skip_num_lines:
              command += [
                  # A hack to remove the following bash warnings (twice):
                  # bash: cannot set terminal process group
                  # bash: no job control in this shell
-                 f'| stdbuf -o0 tail -n +{skip_lines}',
+                 f'| stdbuf -o0 tail -n +{skip_num_lines}',
                  # This is required to make sure the executor of command can get
                  # correct returncode, since linux pipe is used.
                  '; exit ${PIPESTATUS[0]}'
@@ -320,7 +320,7 @@ class CommandRunner:
              separate_stderr: bool = False,
              connect_timeout: Optional[int] = None,
              source_bashrc: bool = False,
-             skip_lines: int = 0,
+             skip_num_lines: int = 0,
              **kwargs) -> Union[int, Tuple[int, str, str]]:
          """Runs the command on the cluster.

@@ -335,7 +335,7 @@ class CommandRunner:
              connect_timeout: timeout in seconds for the ssh connection.
              source_bashrc: Whether to source the ~/.bashrc before running the
                  command.
-             skip_lines: The number of lines to skip at the beginning of the
+             skip_num_lines: The number of lines to skip at the beginning of the
                  output. This is used when the output is not processed by
                  SkyPilot but we still want to get rid of some warning messages,
                  such as SSH warnings.
@@ -529,7 +529,7 @@ class SSHCommandRunner(CommandRunner):
              separate_stderr: bool = False,
              connect_timeout: Optional[int] = None,
              source_bashrc: bool = False,
-             skip_lines: int = 0,
+             skip_num_lines: int = 0,
              **kwargs) -> Union[int, Tuple[int, str, str]]:
          """Uses 'ssh' to run 'cmd' on a node with ip.

@@ -550,7 +550,7 @@ class SSHCommandRunner(CommandRunner):
              connect_timeout: timeout in seconds for the ssh connection.
              source_bashrc: Whether to source the bashrc before running the
                  command.
-             skip_lines: The number of lines to skip at the beginning of the
+             skip_num_lines: The number of lines to skip at the beginning of the
                  output. This is used when the output is not processed by
                  SkyPilot but we still want to get rid of some warning messages,
                  such as SSH warnings.
@@ -573,7 +573,7 @@ class SSHCommandRunner(CommandRunner):
          command_str = self._get_command_to_run(cmd,
                                                 process_stream,
                                                 separate_stderr,
-                                                skip_lines=skip_lines,
+                                                skip_num_lines=skip_num_lines,
                                                 source_bashrc=source_bashrc)
          command = base_ssh_command + [shlex.quote(command_str)]

@@ -693,7 +693,7 @@ class KubernetesCommandRunner(CommandRunner):
              separate_stderr: bool = False,
              connect_timeout: Optional[int] = None,
              source_bashrc: bool = False,
-             skip_lines: int = 0,
+             skip_num_lines: int = 0,
              **kwargs) -> Union[int, Tuple[int, str, str]]:
          """Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace.

@@ -713,7 +713,7 @@ class KubernetesCommandRunner(CommandRunner):
              connect_timeout: timeout in seconds for the pod connection.
              source_bashrc: Whether to source the bashrc before running the
                  command.
-             skip_lines: The number of lines to skip at the beginning of the
+             skip_num_lines: The number of lines to skip at the beginning of the
                  output. This is used when the output is not processed by
                  SkyPilot but we still want to get rid of some warning messages,
                  such as SSH warnings.
@@ -751,7 +751,7 @@ class KubernetesCommandRunner(CommandRunner):
          command_str = self._get_command_to_run(cmd,
                                                 process_stream,
                                                 separate_stderr,
-                                                skip_lines=skip_lines,
+                                                skip_num_lines=skip_num_lines,
                                                 source_bashrc=source_bashrc)
          command = kubectl_base_command + [
              # It is important to use /bin/bash -c here to make sure we quote the
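The renamed `skip_num_lines` keyword feeds the `tail`/`PIPESTATUS` hack shown in the hunks above. A standalone sketch (plain Python and bash, not SkyPilot code) of what that generated fragment does: `tail -n +N` starts printing at line N, and `exit ${PIPESTATUS[0]}` makes the wrapper exit with the wrapped command's return code rather than tail's:

    import subprocess

    skip_num_lines = 3  # start output at line 3, dropping the two warning lines
    cmd = ('bash -c "echo warning1; echo warning2; echo real output; exit 3" '
           f'| stdbuf -o0 tail -n +{skip_num_lines}'
           '; exit ${PIPESTATUS[0]}')
    proc = subprocess.run(['bash', '-c', cmd], capture_output=True, text=True)
    print(proc.stdout)      # 'real output'
    print(proc.returncode)  # 3, the wrapped command's exit code, not tail's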
sky/utils/common_utils.py CHANGED
@@ -16,7 +16,6 @@ import time
  from typing import Any, Callable, Dict, List, Optional, Union
  import uuid

- import colorama
  import jinja2
  import jsonschema
  import yaml
@@ -479,11 +478,9 @@ def format_exception(e: Union[Exception, SystemExit, KeyboardInterrupt],
      Returns:
          A string that represents the exception.
      """
-     bright = colorama.Style.BRIGHT
-     reset = colorama.Style.RESET_ALL
      if use_bracket:
-         return f'{bright}[{class_fullname(e.__class__)}]{reset} {e}'
-     return f'{bright}{class_fullname(e.__class__)}:{reset} {e}'
+         return f'[{class_fullname(e.__class__)}] {e}'
+     return f'{class_fullname(e.__class__)}: {e}'


  def remove_color(s: str):
@@ -679,3 +676,23 @@ def deprecated_function(
          return func(*args, **kwargs)

      return new_func
+
+
+ def truncate_long_string(s: str, max_length: int = 35) -> str:
+     """Truncate a string to a maximum length, preserving whole words."""
+     if len(s) <= max_length:
+         return s
+     splits = s.split(' ')
+     if len(splits[0]) > max_length:
+         return splits[0][:max_length] + '...'  # Use '…'?
+     # Truncate on word boundary.
+     i = 0
+     total = 0
+     for i, part in enumerate(splits):
+         total += len(part)
+         if total >= max_length:
+             break
+     prefix = ' '.join(splits[:i])
+     if len(prefix) < max_length:
+         prefix += s[len(prefix):max_length]
+     return prefix + '...'
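For reference, a quick sketch of the relocated helper's behavior (hand-traced from the code above; the sample strings are illustrative):

    from sky.utils import common_utils

    s = 'SkyPilot is a framework for running AI and batch workloads'
    print(common_utils.truncate_long_string(s, max_length=20))
    # 'SkyPilot is a framew...' -- prefers a word boundary, then pads with raw
    # characters up to max_length before appending '...'
    print(common_utils.truncate_long_string('short string'))
    # 'short string' -- returned unchanged, within the default 35 characters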
sky/utils/controller_utils.py CHANGED
@@ -28,6 +28,7 @@ from sky.serve import serve_utils
  from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import env_options
+ from sky.utils import rich_utils
  from sky.utils import ux_utils

  if typing.TYPE_CHECKING:
@@ -192,7 +193,11 @@ def _get_cloud_dependencies_installation_commands(
      # TODO(tian): Make dependency installation command a method of cloud
      # class and get all installation command for enabled clouds.
      commands = []
-     prefix_str = 'Check & install cloud dependencies on controller: '
+     # We use <step>/<total> instead of strong formatting, as we need to update
+     # the <total> at the end of the for loop, and python does not support
+     # partial string formatting.
+     prefix_str = ('[<step>/<total>] Check & install cloud dependencies '
+                   'on controller: ')
      # This is to make sure the shorter checking message does not have junk
      # characters from the previous message.
      empty_str = ' ' * 10
@@ -203,6 +208,7 @@ def _get_cloud_dependencies_installation_commands(
          # other clouds will install boto3 but not awscli.
          'pip list | grep awscli> /dev/null 2>&1 || pip install "urllib3<2" '
          'awscli>=1.27.10 "colorama<0.4.5" > /dev/null 2>&1')
+     setup_clouds: List[str] = []
      for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
          if isinstance(
                  clouds,
@@ -211,11 +217,16 @@ def _get_cloud_dependencies_installation_commands(
              # fluidstack and paperspace
              continue
          if isinstance(cloud, clouds.AWS):
-             commands.append(f'echo -en "\\r{prefix_str}AWS{empty_str}" && ' +
+             step_prefix = prefix_str.replace('<step>',
+                                              str(len(setup_clouds) + 1))
+             commands.append(f'echo -en "\\r{step_prefix}AWS{empty_str}" && ' +
                              aws_dependencies_installation)
+             setup_clouds.append(str(cloud))
          elif isinstance(cloud, clouds.Azure):
+             step_prefix = prefix_str.replace('<step>',
+                                              str(len(setup_clouds) + 1))
              commands.append(
-                 f'echo -en "\\r{prefix_str}Azure{empty_str}" && '
+                 f'echo -en "\\r{step_prefix}Azure{empty_str}" && '
                  'pip list | grep azure-cli > /dev/null 2>&1 || '
                  'pip install "azure-cli>=2.31.0" azure-core '
                  '"azure-identity>=1.13.0" azure-mgmt-network > /dev/null 2>&1')
@@ -225,9 +236,12 @@ def _get_cloud_dependencies_installation_commands(
              commands.append(
                  'pip list | grep azure-storage-blob > /dev/null 2>&1 || '
                  'pip install azure-storage-blob msgraph-sdk > /dev/null 2>&1')
+             setup_clouds.append(str(cloud))
          elif isinstance(cloud, clouds.GCP):
+             step_prefix = prefix_str.replace('<step>',
+                                              str(len(setup_clouds) + 1))
              commands.append(
-                 f'echo -en "\\r{prefix_str}GCP{empty_str}" && '
+                 f'echo -en "\\r{step_prefix}GCP{empty_str}" && '
                  'pip list | grep google-api-python-client > /dev/null 2>&1 || '
                  'pip install "google-api-python-client>=2.69.0" '
                  '> /dev/null 2>&1')
@@ -238,9 +252,12 @@ def _get_cloud_dependencies_installation_commands(
                  'pip list | grep google-cloud-storage > /dev/null 2>&1 || '
                  'pip install google-cloud-storage > /dev/null 2>&1')
              commands.append(f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
+             setup_clouds.append(str(cloud))
          elif isinstance(cloud, clouds.Kubernetes):
+             step_prefix = prefix_str.replace('<step>',
+                                              str(len(setup_clouds) + 1))
              commands.append(
-                 f'echo -en "\\r{prefix_str}Kubernetes{empty_str}" && '
+                 f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
                  'pip list | grep kubernetes > /dev/null 2>&1 || '
                  'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&'
                  # Install k8s + skypilot dependencies
@@ -248,8 +265,8 @@ def _get_cloud_dependencies_installation_commands(
                  '! command -v curl &> /dev/null || '
                  '! command -v socat &> /dev/null || '
                  '! command -v netcat &> /dev/null; '
-                 'then apt update && apt install curl socat netcat -y '
-                 '&> /dev/null; '
+                 'then apt update &> /dev/null && '
+                 'apt install curl socat netcat -y &> /dev/null; '
                  'fi" && '
                  # Install kubectl
                  '(command -v kubectl &>/dev/null || '
@@ -258,34 +275,55 @@ def _get_cloud_dependencies_installation_commands(
                  '/bin/linux/amd64/kubectl" && '
                  'sudo install -o root -g root -m 0755 '
                  'kubectl /usr/local/bin/kubectl))')
+             setup_clouds.append(str(cloud))
          elif isinstance(cloud, clouds.Cudo):
+             step_prefix = prefix_str.replace('<step>',
+                                              str(len(setup_clouds) + 1))
              commands.append(
-                 f'echo -en "\\r{prefix_str}Cudo{empty_str}" && '
+                 f'echo -en "\\r{step_prefix}Cudo{empty_str}" && '
                  'pip list | grep cudo-compute > /dev/null 2>&1 || '
                  'pip install "cudo-compute>=0.1.10" > /dev/null 2>&1 && '
                  'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && '  # pylint: disable=line-too-long
                  'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1')
+             setup_clouds.append(str(cloud))
          elif isinstance(cloud, clouds.RunPod):
-             commands.append(f'echo -en "\\r{prefix_str}RunPod{empty_str}" && '
+             step_prefix = prefix_str.replace('<step>',
+                                              str(len(setup_clouds) + 1))
+             commands.append(f'echo -en "\\r{step_prefix}RunPod{empty_str}" && '
                              'pip list | grep runpod > /dev/null 2>&1 || '
                              'pip install "runpod>=1.5.1" > /dev/null 2>&1')
+             setup_clouds.append(str(cloud))
          if controller == Controllers.JOBS_CONTROLLER:
              if isinstance(cloud, clouds.IBM):
+                 step_prefix = prefix_str.replace('<step>',
+                                                  str(len(setup_clouds) + 1))
                  commands.append(
-                     f'echo -en "\\r{prefix_str}IBM{empty_str}" '
+                     f'echo -en "\\r{step_prefix}IBM{empty_str}" '
                      '&& pip list | grep ibm-cloud-sdk-core > /dev/null 2>&1 || '
                      'pip install ibm-cloud-sdk-core ibm-vpc '
                      'ibm-platform-services ibm-cos-sdk > /dev/null 2>&1')
+                 setup_clouds.append(str(cloud))
              elif isinstance(cloud, clouds.OCI):
+                 step_prefix = prefix_str.replace('<step>',
+                                                  str(len(setup_clouds) + 1))
                  commands.append(f'echo -en "\\r{prefix_str}OCI{empty_str}" && '
                                  'pip list | grep oci > /dev/null 2>&1 || '
                                  'pip install oci > /dev/null 2>&1')
+                 setup_clouds.append(str(cloud))
      if (cloudflare.NAME
              in storage_lib.get_cached_enabled_storage_clouds_or_refresh()):
-         commands.append(f'echo -en "\\r{prefix_str}Cloudflare{empty_str}" && ' +
-                         aws_dependencies_installation)
-     commands.append(f'echo -e "\\r{prefix_str}Done for {len(commands)} '
-                     'clouds."')
+         step_prefix = prefix_str.replace('<step>', str(len(setup_clouds) + 1))
+         commands.append(
+             f'echo -en "\\r{step_prefix}Cloudflare{empty_str}" && ' +
+             aws_dependencies_installation)
+         setup_clouds.append(cloudflare.NAME)
+
+     finish_prefix = prefix_str.replace('[<step>/<total>] ', ' ')
+     commands.append(f'echo -e "\\r{finish_prefix}done.{empty_str}"')
+     commands = [
+         command.replace('<total>', str(len(setup_clouds)))
+         for command in commands
+     ]
      return commands

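The `<step>/<total>` placeholders above exist because `<total>` is only known after the loop finishes and Python has no partial string formatting, so every command is patched at the end. A standalone sketch of the same trick (not SkyPilot code; the cloud names are examples):

    prefix_str = '[<step>/<total>] Check & install cloud dependencies: '
    setup_clouds = ['AWS', 'GCP', 'Kubernetes']

    commands = []
    for i, cloud in enumerate(setup_clouds):
        step_prefix = prefix_str.replace('<step>', str(i + 1))
        commands.append(f'echo -en "\\r{step_prefix}{cloud}"')
    # <total> is substituted once the number of set-up clouds is known.
    commands = [c.replace('<total>', str(len(setup_clouds))) for c in commands]
    print(commands[0])  # echo -en "\r[1/3] Check & install cloud dependencies: AWS"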
 
@@ -388,7 +426,7 @@ def shared_controller_vars_to_fill(
          'local_user_config_path': local_user_config_path,
      }
      env_vars: Dict[str, str] = {
-         env.value: '1' for env in env_options.Options if env.get()
+         env.env_key: str(int(env.get())) for env in env_options.Options
      }
      env_vars.update({
          # Should not use $USER here, as that env var can be empty when
@@ -396,7 +434,9 @@ def shared_controller_vars_to_fill(
          constants.USER_ENV_VAR: getpass.getuser(),
          constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
          # Skip cloud identity check to avoid the overhead.
-         env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1',
+         env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
+         # Disable minimize logging to get more details on the controller.
+         env_options.Options.MINIMIZE_LOGGING.env_key: '0',
      })
      if skypilot_config.loaded():
          # Only set the SKYPILOT_CONFIG env var if the user has a config file.
@@ -599,6 +639,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
      # ================================================================
      # Translate the workdir and local file mounts to cloud file mounts.
      # ================================================================
+
      run_id = common_utils.get_usage_run_id()[:8]
      original_file_mounts = task.file_mounts if task.file_mounts else {}
      original_storage_mounts = task.storage_mounts if task.storage_mounts else {}
@@ -618,8 +659,12 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
      elif has_local_source_paths_workdir:
          msg = 'workdir'
      if msg:
-         logger.info(f'{colorama.Fore.YELLOW}Translating {msg} to SkyPilot '
-                     f'Storage...{colorama.Style.RESET_ALL}')
+         logger.info(
+             ux_utils.starting_message(f'Translating {msg} to '
+                                       'SkyPilot Storage...'))
+         rich_utils.force_update_status(
+             ux_utils.spinner_message(
+                 f'Translating {msg} to SkyPilot Storage...'))

      # Step 1: Translate the workdir to SkyPilot storage.
      new_storage_mounts = {}
@@ -643,8 +688,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
          })
          # Check of the existence of the workdir in file_mounts is done in
          # the task construction.
-         logger.info(f'Workdir {workdir!r} will be synced to cloud storage '
-                     f'{bucket_name!r}.')
+         logger.info(f' {colorama.Style.DIM}Workdir: {workdir!r} '
+                     f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')

      # Step 2: Translate the local file mounts with folder in src to SkyPilot
      # storage.
@@ -668,9 +713,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
              'persistent': False,
              'mode': 'COPY',
          })
-         logger.info(
-             f'Folder in local file mount {src!r} will be synced to SkyPilot '
-             f'storage {bucket_name}.')
+         logger.info(f' {colorama.Style.DIM}Folder : {src!r} '
+                     f'-> storage: {bucket_name!r}.{colorama.Style.RESET_ALL}')

      # Step 3: Translate local file mounts with file in src to SkyPilot storage.
      # Hard link the files in src to a temporary directory, and upload folder.
@@ -703,10 +747,12 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
                           f'destination {file_mount_remote_tmp_dir} '
                           'being taken.')
      sources = list(src_to_file_id.keys())
-     sources_str = '\n\t'.join(sources)
-     logger.info('Source files in file_mounts will be synced to '
-                 f'cloud storage {file_bucket_name}:'
-                 f'\n\t{sources_str}')
+     sources_str = '\n '.join(sources)
+     logger.info(f' {colorama.Style.DIM}Files (listed below) '
+                 f' -> storage: {file_bucket_name}:'
+                 f'\n {sources_str}{colorama.Style.RESET_ALL}')
+     rich_utils.force_update_status(
+         ux_utils.spinner_message('Uploading translated local files/folders'))
      task.update_storage_mounts(new_storage_mounts)

      # Step 4: Upload storage from sources
@@ -716,8 +762,9 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
      if task.storage_mounts:
          # There may be existing (non-translated) storage mounts, so log this
          # whenever task.storage_mounts is non-empty.
-         logger.info(f'{colorama.Fore.YELLOW}Uploading sources to cloud storage.'
-                     f'{colorama.Style.RESET_ALL} See: sky storage ls')
+         rich_utils.force_update_status(
+             ux_utils.spinner_message('Uploading local sources to storage[/] '
+                                      '[dim]View storages: sky storage ls'))
      try:
          task.sync_storage_mounts()
      except ValueError as e:
@@ -800,3 +847,5 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
              })
              updated_mount_storages[storage_path] = new_storage
      task.update_storage_mounts(updated_mount_storages)
+     if msg:
+         logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
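Taken together, the logging changes in this file follow one pattern, using only helpers that appear in the hunks above (a sketch; the message text and logger setup are illustrative, not from this diff):

    from sky import sky_logging
    from sky.utils import rich_utils, ux_utils

    logger = sky_logging.init_logger(__name__)

    msg = 'Translating workdir to SkyPilot Storage...'
    logger.info(ux_utils.starting_message(msg))                    # logged once
    rich_utils.force_update_status(ux_utils.spinner_message(msg))  # live spinner
    # ... translate and upload ...
    logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))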
sky/utils/env_options.py CHANGED
@@ -5,17 +5,32 @@ import os

  class Options(enum.Enum):
      """Environment variables for SkyPilot."""
-     IS_DEVELOPER = 'SKYPILOT_DEV'
-     SHOW_DEBUG_INFO = 'SKYPILOT_DEBUG'
-     DISABLE_LOGGING = 'SKYPILOT_DISABLE_USAGE_COLLECTION'
-     MINIMIZE_LOGGING = 'SKYPILOT_MINIMIZE_LOGGING'
+
+     # (env var name, default value)
+     IS_DEVELOPER = ('SKYPILOT_DEV', False)
+     SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False)
+     DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False)
+     MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True)
      # Internal: this is used to skip the cloud user identity check, which is
      # used to protect cluster operations in a multi-identity scenario.
      # Currently, this is only used in the job and serve controller, as there
      # will not be multiple identities, and skipping the check can increase
      # robustness.
-     SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK'
+     SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False)
+
+     def __init__(self, env_var: str, default: bool) -> None:
+         self.env_var = env_var
+         self.default = default

-     def get(self):
+     def __repr__(self) -> str:
+         return self.env_var
+
+     def get(self) -> bool:
          """Check if an environment variable is set to True."""
-         return os.getenv(self.value, 'False').lower() in ('true', '1')
+         return os.getenv(self.env_var,
+                          str(self.default)).lower() in ('true', '1')
+
+     @property
+     def env_key(self) -> str:
+         """The environment variable key name."""
+         return self.value[0]
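A short sketch of how the reworked enum behaves (standalone, based only on the class above; the environment values are examples):

    import os

    from sky.utils import env_options

    os.environ['SKYPILOT_DEBUG'] = '1'
    opt = env_options.Options.SHOW_DEBUG_INFO
    print(opt.env_key)  # 'SKYPILOT_DEBUG', the first element of the tuple value
    print(opt.get())    # True, since the env var is set to '1'
    print(env_options.Options.MINIMIZE_LOGGING.get())  # True by default if unset

This pairs with the controller change above: `env.env_key: str(int(env.get()))` now forwards every option to the controller as an explicit '0' or '1' instead of only passing the ones that are set.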