skypilot-nightly 1.0.0.dev20241014__py3-none-any.whl → 1.0.0.dev20241016__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '340f38404fe5d3ebe35ea430a67cb3377241d1f3'
8
+ _SKYPILOT_COMMIT_SHA = '53380e26f01452559012d57b333b17f40dd8a4d1'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241014'
38
+ __version__ = '1.0.0.dev20241016'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -56,7 +56,7 @@ from sky.utils import timeline
56
56
  from sky.utils import ux_utils
57
57
 
58
58
  if typing.TYPE_CHECKING:
59
- from sky import resources
59
+ from sky import resources as resources_lib
60
60
  from sky import task as task_lib
61
61
  from sky.backends import cloud_vm_ray_backend
62
62
  from sky.backends import local_docker_backend
@@ -751,7 +751,7 @@ def _replace_yaml_dicts(
751
751
  # TODO: too many things happening here - leaky abstraction. Refactor.
752
752
  @timeline.event
753
753
  def write_cluster_config(
754
- to_provision: 'resources.Resources',
754
+ to_provision: 'resources_lib.Resources',
755
755
  num_nodes: int,
756
756
  cluster_config_template: str,
757
757
  cluster_name: str,
@@ -2844,9 +2844,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2844
2844
  time.sleep(gap_seconds)
2845
2845
  continue
2846
2846
  logger.error(
2847
- f'{colorama.Fore.RED}⨯{colorama.Style.RESET_ALL} '
2848
- 'Failed to provision resources. '
2849
- f'{ux_utils.log_path_hint(log_path)}')
2847
+ ux_utils.error_message(
2848
+ 'Failed to provision resources. '
2849
+ f'{ux_utils.log_path_hint(log_path)}'))
2850
2850
  error_message += (
2851
2851
  '\nTo keep retrying until the cluster is up, use '
2852
2852
  'the `--retry-until-up` flag.')
sky/cli.py CHANGED
@@ -1464,54 +1464,8 @@ def _status_kubernetes(show_all: bool):
1464
1464
  Args:
1465
1465
  show_all (bool): Show all job information (e.g., start time, failures).
1466
1466
  """
1467
- context = kubernetes_utils.get_current_kube_config_context_name()
1468
- try:
1469
- pods = kubernetes_utils.get_skypilot_pods(context)
1470
- except exceptions.ResourcesUnavailableError as e:
1471
- with ux_utils.print_exception_no_traceback():
1472
- raise ValueError('Failed to get SkyPilot pods from '
1473
- f'Kubernetes: {str(e)}') from e
1474
- all_clusters, jobs_controllers, serve_controllers = (
1475
- status_utils.process_skypilot_pods(pods, context))
1476
- all_jobs = []
1477
- with rich_utils.safe_status(
1478
- '[bold cyan]Checking in-progress managed jobs[/]') as spinner:
1479
- for i, (_, job_controller_info) in enumerate(jobs_controllers.items()):
1480
- user = job_controller_info['user']
1481
- pod = job_controller_info['pods'][0]
1482
- status_message = ('[bold cyan]Checking managed jobs controller')
1483
- if len(jobs_controllers) > 1:
1484
- status_message += f's ({i+1}/{len(jobs_controllers)})'
1485
- spinner.update(f'{status_message}[/]')
1486
- try:
1487
- job_list = managed_jobs.queue_from_kubernetes_pod(
1488
- pod.metadata.name)
1489
- except RuntimeError as e:
1490
- logger.warning('Failed to get managed jobs from controller '
1491
- f'{pod.metadata.name}: {str(e)}')
1492
- job_list = []
1493
- # Add user field to jobs
1494
- for job in job_list:
1495
- job['user'] = user
1496
- all_jobs.extend(job_list)
1497
- # Reconcile cluster state between managed jobs and clusters:
1498
- # To maintain a clear separation between regular SkyPilot clusters
1499
- # and those from managed jobs, we need to exclude the latter from
1500
- # the main cluster list.
1501
- # We do this by reconstructing managed job cluster names from each
1502
- # job's name and ID. We then use this set to filter out managed
1503
- # clusters from the main cluster list. This is necessary because there
1504
- # are no identifiers distinguishing clusters from managed jobs from
1505
- # regular clusters.
1506
- managed_job_cluster_names = set()
1507
- for job in all_jobs:
1508
- # Managed job cluster name is <job_name>-<job_id>
1509
- managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
1510
- managed_job_cluster_names.add(managed_cluster_name)
1511
- unmanaged_clusters = [
1512
- c for c in all_clusters
1513
- if c['cluster_name'] not in managed_job_cluster_names
1514
- ]
1467
+ all_clusters, unmanaged_clusters, all_jobs, context = (
1468
+ core.status_kubernetes())
1515
1469
  click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1516
1470
  f'Kubernetes cluster state (context: {context})'
1517
1471
  f'{colorama.Style.RESET_ALL}')
@@ -1523,7 +1477,7 @@ def _status_kubernetes(show_all: bool):
1523
1477
  f'{colorama.Style.RESET_ALL}')
1524
1478
  msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
1525
1479
  click.echo(msg)
1526
- if serve_controllers:
1480
+ if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
1527
1481
  # TODO: Parse serve controllers and show services separately.
1528
1482
  # Currently we show a hint that services are shown as clusters.
1529
1483
  click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
sky/clouds/gcp.py CHANGED
@@ -94,6 +94,12 @@ _IMAGE_NOT_FOUND_UX_MESSAGE = (
94
94
  f'\nTo query common AI images: {colorama.Style.BRIGHT}gcloud compute images list --project deeplearning-platform-release | less{colorama.Style.RESET_ALL}'
95
95
  )
96
96
 
97
+ # Image ID tags
98
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
99
+ # For GPU-related package version, see sky/clouds/service_catalog/images/provisioners/cuda.sh
100
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
101
+ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
102
+
97
103
 
98
104
  def _run_output(cmd):
99
105
  proc = subprocess.run(cmd,
@@ -422,7 +428,7 @@ class GCP(clouds.Cloud):
422
428
  # --no-standard-images
423
429
  # We use the debian image, as the ubuntu image has some connectivity
424
430
  # issue when first booted.
425
- image_id = 'skypilot:cpu-debian-11'
431
+ image_id = _DEFAULT_CPU_IMAGE_ID
426
432
 
427
433
  def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
428
434
  if (r.disk_tier is not None and
@@ -487,10 +493,10 @@ class GCP(clouds.Cloud):
487
493
  # Though the image is called cu113, it actually has later
488
494
  # versions of CUDA as noted below.
489
495
  # CUDA driver version 470.57.02, CUDA Library 11.4
490
- image_id = 'skypilot:k80-debian-10'
496
+ image_id = _DEFAULT_GPU_K80_IMAGE_ID
491
497
  else:
492
498
  # CUDA driver version 535.86.10, CUDA Library 12.2
493
- image_id = 'skypilot:gpu-debian-11'
499
+ image_id = _DEFAULT_GPU_IMAGE_ID
494
500
 
495
501
  if (resources.image_id is not None and
496
502
  resources.extract_docker_image() is None):
sky/core.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """SDK functions for cluster/job management."""
2
2
  import getpass
3
3
  import typing
4
- from typing import Any, Dict, List, Optional, Union
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
5
 
6
6
  import colorama
7
7
 
@@ -11,10 +11,12 @@ from sky import dag
11
11
  from sky import data
12
12
  from sky import exceptions
13
13
  from sky import global_user_state
14
+ from sky import jobs as managed_jobs
14
15
  from sky import sky_logging
15
16
  from sky import status_lib
16
17
  from sky import task
17
18
  from sky.backends import backend_utils
19
+ from sky.provision.kubernetes import utils as kubernetes_utils
18
20
  from sky.skylet import constants
19
21
  from sky.skylet import job_lib
20
22
  from sky.usage import usage_lib
@@ -111,6 +113,79 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
111
113
  cluster_names=cluster_names)
112
114
 
113
115
 
116
+ def status_kubernetes(
117
+ ) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
118
+ List['kubernetes_utils.KubernetesSkyPilotClusterInfo'], List[Dict[
119
+ str, Any]], Optional[str]]:
120
+ """Get all SkyPilot clusters and jobs in the Kubernetes cluster.
121
+
122
+ Managed jobs and services are also included in the clusters returned.
123
+ The caller must parse the controllers to identify which clusters are run
124
+ as managed jobs or services.
125
+ all_clusters, unmanaged_clusters, all_jobs, context
126
+ Returns:
127
+ A tuple containing:
128
+ - all_clusters: List of KubernetesSkyPilotClusterInfo with info for
129
+ all clusters, including managed jobs, services and controllers.
130
+ - unmanaged_clusters: List of KubernetesSkyPilotClusterInfo with info
131
+ for all clusters excluding managed jobs and services. Controllers
132
+ are included.
133
+ - all_jobs: List of managed jobs from all controllers. Each entry is a
134
+ dictionary job info, see jobs.queue_from_kubernetes_pod for details.
135
+ - context: Kubernetes context used to fetch the cluster information.
136
+ """
137
+ context = kubernetes_utils.get_current_kube_config_context_name()
138
+ try:
139
+ pods = kubernetes_utils.get_skypilot_pods(context)
140
+ except exceptions.ResourcesUnavailableError as e:
141
+ with ux_utils.print_exception_no_traceback():
142
+ raise ValueError('Failed to get SkyPilot pods from '
143
+ f'Kubernetes: {str(e)}') from e
144
+ all_clusters, jobs_controllers, _ = (kubernetes_utils.process_skypilot_pods(
145
+ pods, context))
146
+ all_jobs = []
147
+ with rich_utils.safe_status(
148
+ ux_utils.spinner_message(
149
+ '[bold cyan]Checking in-progress managed jobs[/]')) as spinner:
150
+ for i, job_controller_info in enumerate(jobs_controllers):
151
+ user = job_controller_info.user
152
+ pod = job_controller_info.pods[0]
153
+ status_message = '[bold cyan]Checking managed jobs controller'
154
+ if len(jobs_controllers) > 1:
155
+ status_message += f's ({i + 1}/{len(jobs_controllers)})'
156
+ spinner.update(f'{status_message}[/]')
157
+ try:
158
+ job_list = managed_jobs.queue_from_kubernetes_pod(
159
+ pod.metadata.name)
160
+ except RuntimeError as e:
161
+ logger.warning('Failed to get managed jobs from controller '
162
+ f'{pod.metadata.name}: {str(e)}')
163
+ job_list = []
164
+ # Add user field to jobs
165
+ for job in job_list:
166
+ job['user'] = user
167
+ all_jobs.extend(job_list)
168
+ # Reconcile cluster state between managed jobs and clusters:
169
+ # To maintain a clear separation between regular SkyPilot clusters
170
+ # and those from managed jobs, we need to exclude the latter from
171
+ # the main cluster list.
172
+ # We do this by reconstructing managed job cluster names from each
173
+ # job's name and ID. We then use this set to filter out managed
174
+ # clusters from the main cluster list. This is necessary because there
175
+ # are no identifiers distinguishing clusters from managed jobs from
176
+ # regular clusters.
177
+ managed_job_cluster_names = set()
178
+ for job in all_jobs:
179
+ # Managed job cluster name is <job_name>-<job_id>
180
+ managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
181
+ managed_job_cluster_names.add(managed_cluster_name)
182
+ unmanaged_clusters = [
183
+ c for c in all_clusters
184
+ if c.cluster_name not in managed_job_cluster_names
185
+ ]
186
+ return all_clusters, unmanaged_clusters, all_jobs, context
187
+
188
+
114
189
  def endpoints(cluster: str,
115
190
  port: Optional[Union[int, str]] = None) -> Dict[int, str]:
116
191
  """Gets the endpoint for a given cluster and port number (endpoint).
@@ -15,9 +15,11 @@ import jinja2
15
15
  import yaml
16
16
 
17
17
  import sky
18
+ from sky import clouds
18
19
  from sky import exceptions
19
20
  from sky import sky_logging
20
21
  from sky import skypilot_config
22
+ from sky import status_lib
21
23
  from sky.adaptors import kubernetes
22
24
  from sky.provision import constants as provision_constants
23
25
  from sky.provision.kubernetes import network_utils
@@ -30,6 +32,7 @@ from sky.utils import ux_utils
30
32
 
31
33
  if typing.TYPE_CHECKING:
32
34
  from sky import backends
35
+ from sky import resources as resources_lib
33
36
 
34
37
  # TODO(romilb): Move constants to constants.py
35
38
  DEFAULT_NAMESPACE = 'default'
@@ -2023,3 +2026,113 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
2023
2026
  'kubectl get pods --selector=skypilot-cluster --all-namespaces'
2024
2027
  ) from None
2025
2028
  return pods
2029
+
2030
+
2031
+ @dataclasses.dataclass
2032
+ class KubernetesSkyPilotClusterInfo:
2033
+ cluster_name_on_cloud: str
2034
+ cluster_name: str
2035
+ user: str
2036
+ status: status_lib.ClusterStatus
2037
+ pods: List[Any]
2038
+ launched_at: float
2039
+ resources: 'resources_lib.Resources'
2040
+ resources_str: str
2041
+
2042
+
2043
+ def process_skypilot_pods(
2044
+ pods: List[Any],
2045
+ context: Optional[str] = None
2046
+ ) -> Tuple[List[KubernetesSkyPilotClusterInfo],
2047
+ List[KubernetesSkyPilotClusterInfo],
2048
+ List[KubernetesSkyPilotClusterInfo]]:
2049
+ """Process SkyPilot pods on k8s to extract cluster and controller info.
2050
+
2051
+ Args:
2052
+ pods: List of Kubernetes pod objects.
2053
+ context: Kubernetes context name, used to detect GPU label formatter.
2054
+
2055
+ Returns:
2056
+ A tuple containing:
2057
+ - List of KubernetesSkyPilotClusterInfo with all cluster info.
2058
+ - List of KubernetesSkyPilotClusterInfo with job controller info.
2059
+ - List of KubernetesSkyPilotClusterInfo with serve controller info.
2060
+ """
2061
+ # pylint: disable=import-outside-toplevel
2062
+ from sky import resources as resources_lib
2063
+ clusters: Dict[str, KubernetesSkyPilotClusterInfo] = {}
2064
+ jobs_controllers: List[KubernetesSkyPilotClusterInfo] = []
2065
+ serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
2066
+
2067
+ for pod in pods:
2068
+ cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
2069
+ cluster_name = cluster_name_on_cloud.rsplit(
2070
+ '-', 1
2071
+ )[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
2072
+ if cluster_name_on_cloud not in clusters:
2073
+ # Parse the start time for the cluster
2074
+ start_time = pod.status.start_time
2075
+ if start_time is not None:
2076
+ start_time = pod.status.start_time.timestamp()
2077
+
2078
+ # Parse resources
2079
+ cpu_request = parse_cpu_or_gpu_resource(
2080
+ pod.spec.containers[0].resources.requests.get('cpu', '0'))
2081
+ memory_request = parse_memory_resource(
2082
+ pod.spec.containers[0].resources.requests.get('memory', '0'),
2083
+ unit='G')
2084
+ gpu_count = parse_cpu_or_gpu_resource(
2085
+ pod.spec.containers[0].resources.requests.get(
2086
+ 'nvidia.com/gpu', '0'))
2087
+ gpu_name = None
2088
+ if gpu_count > 0:
2089
+ label_formatter, _ = (detect_gpu_label_formatter(context))
2090
+ assert label_formatter is not None, (
2091
+ 'GPU label formatter cannot be None if there are pods '
2092
+ f'requesting GPUs: {pod.metadata.name}')
2093
+ gpu_label = label_formatter.get_label_key()
2094
+ # Get GPU name from pod node selector
2095
+ if pod.spec.node_selector is not None:
2096
+ gpu_name = label_formatter.get_accelerator_from_label_value(
2097
+ pod.spec.node_selector.get(gpu_label))
2098
+
2099
+ resources = resources_lib.Resources(
2100
+ cloud=clouds.Kubernetes(),
2101
+ cpus=int(cpu_request),
2102
+ memory=int(memory_request),
2103
+ accelerators=(f'{gpu_name}:{gpu_count}'
2104
+ if gpu_count > 0 else None))
2105
+ if pod.status.phase == 'Pending':
2106
+ # If pod is pending, do not show it in the status
2107
+ continue
2108
+
2109
+ cluster_info = KubernetesSkyPilotClusterInfo(
2110
+ cluster_name_on_cloud=cluster_name_on_cloud,
2111
+ cluster_name=cluster_name,
2112
+ user=pod.metadata.labels.get('skypilot-user'),
2113
+ status=status_lib.ClusterStatus.UP,
2114
+ pods=[],
2115
+ launched_at=start_time,
2116
+ resources=resources,
2117
+ resources_str='')
2118
+ clusters[cluster_name_on_cloud] = cluster_info
2119
+ # Check if cluster name is name of a controller
2120
+ # Can't use controller_utils.Controllers.from_name(cluster_name)
2121
+ # because hash is different across users
2122
+ if 'sky-jobs-controller' in cluster_name_on_cloud:
2123
+ jobs_controllers.append(cluster_info)
2124
+ elif 'sky-serve-controller' in cluster_name_on_cloud:
2125
+ serve_controllers.append(cluster_info)
2126
+ else:
2127
+ # Update start_time if this pod started earlier
2128
+ pod_start_time = pod.status.start_time
2129
+ if pod_start_time is not None:
2130
+ pod_start_time = pod_start_time.timestamp()
2131
+ if pod_start_time < clusters[cluster_name_on_cloud].launched_at:
2132
+ clusters[cluster_name_on_cloud].launched_at = pod_start_time
2133
+ clusters[cluster_name_on_cloud].pods.append(pod)
2134
+ # Update resources_str in clusters:
2135
+ for cluster in clusters.values():
2136
+ num_pods = len(cluster.pods)
2137
+ cluster.resources_str = f'{num_pods}x {cluster.resources}'
2138
+ return list(clusters.values()), jobs_controllers, serve_controllers
@@ -571,7 +571,10 @@ def post_provision_runtime_setup(
571
571
  provision_record=provision_record,
572
572
  custom_resource=custom_resource)
573
573
  except Exception: # pylint: disable=broad-except
574
- logger.error('*** Failed setting up cluster. ***')
574
+ logger.error(
575
+ ux_utils.error_message(
576
+ 'Failed to set up SkyPilot runtime on cluster.',
577
+ provision_logging.config.log_path))
575
578
  logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
576
579
  with ux_utils.print_exception_no_traceback():
577
580
  raise
sky/skylet/constants.py CHANGED
@@ -155,8 +155,8 @@ CONDA_INSTALLATION_COMMANDS = (
155
155
  # We use --system-site-packages to reuse the system site packages to avoid
156
156
  # the overhead of installing the same packages in the new environment.
157
157
  f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
158
- f'{{ {SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages && '
159
- f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE}; }};'
158
+ f'{SKY_PYTHON_CMD} -m venv {SKY_REMOTE_PYTHON_ENV} --system-site-packages;'
159
+ f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE};'
160
160
  )
161
161
 
162
162
  _sky_version = str(version.parse(sky.__version__))
@@ -1,19 +1,20 @@
1
1
  """Utilities for sky status."""
2
- from typing import Any, Callable, Dict, List, Optional, Tuple
2
+ import typing
3
+ from typing import Any, Callable, Dict, List, Optional
3
4
 
4
5
  import click
5
6
  import colorama
6
7
 
7
8
  from sky import backends
8
- from sky import clouds as sky_clouds
9
- from sky import resources as resources_lib
10
9
  from sky import status_lib
11
- from sky.provision.kubernetes import utils as kubernetes_utils
12
10
  from sky.skylet import constants
13
11
  from sky.utils import common_utils
14
12
  from sky.utils import log_utils
15
13
  from sky.utils import resources_utils
16
14
 
15
+ if typing.TYPE_CHECKING:
16
+ from sky.provision.kubernetes import utils as kubernetes_utils
17
+
17
18
  COMMAND_TRUNC_LENGTH = 25
18
19
  NUM_COST_REPORT_LINES = 5
19
20
 
@@ -303,19 +304,19 @@ def _get_estimated_cost_for_cost_report(
303
304
  return f'$ {cost:.2f}'
304
305
 
305
306
 
306
- def show_kubernetes_cluster_status_table(clusters: List[Any],
307
- show_all: bool) -> None:
307
+ def show_kubernetes_cluster_status_table(
308
+ clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
309
+ show_all: bool) -> None:
308
310
  """Compute cluster table values and display for Kubernetes clusters."""
309
311
  status_columns = [
310
- StatusColumn('USER', lambda c: c['user']),
311
- StatusColumn('NAME', lambda c: c['cluster_name']),
312
- StatusColumn(
313
- 'LAUNCHED',
314
- lambda c: log_utils.readable_time_duration(c['launched_at'])),
312
+ StatusColumn('USER', lambda c: c.user),
313
+ StatusColumn('NAME', lambda c: c.cluster_name),
314
+ StatusColumn('LAUNCHED',
315
+ lambda c: log_utils.readable_time_duration(c.launched_at)),
315
316
  StatusColumn('RESOURCES',
316
- lambda c: c['resources_str'],
317
+ lambda c: c.resources_str,
317
318
  trunc_length=70 if not show_all else 0),
318
- StatusColumn('STATUS', lambda c: c['status'].colored_str()),
319
+ StatusColumn('STATUS', lambda c: c.status.colored_str()),
319
320
  # TODO(romilb): We should consider adding POD_NAME field here when --all
320
321
  # is passed to help users fetch pod name programmatically.
321
322
  ]
@@ -326,8 +327,7 @@ def show_kubernetes_cluster_status_table(clusters: List[Any],
326
327
  cluster_table = log_utils.create_table(columns)
327
328
 
328
329
  # Sort table by user, then by cluster name
329
- sorted_clusters = sorted(clusters,
330
- key=lambda c: (c['user'], c['cluster_name']))
330
+ sorted_clusters = sorted(clusters, key=lambda c: (c.user, c.cluster_name))
331
331
 
332
332
  for cluster in sorted_clusters:
333
333
  row = []
@@ -344,122 +344,3 @@ def show_kubernetes_cluster_status_table(clusters: List[Any],
344
344
  else:
345
345
  click.echo('No SkyPilot resources found in the '
346
346
  'active Kubernetes context.')
347
-
348
-
349
- def process_skypilot_pods(
350
- pods: List[Any],
351
- context: Optional[str] = None
352
- ) -> Tuple[List[Dict[Any, Any]], Dict[str, Any], Dict[str, Any]]:
353
- """Process SkyPilot pods on k8s to extract cluster and controller info.
354
-
355
- Args:
356
- pods: List of Kubernetes pod objects.
357
- context: Kubernetes context name, used to detect GPU label formatter.
358
-
359
- Returns:
360
- A tuple containing:
361
- - List of dictionaries with cluster information.
362
- - Dictionary of job controller information.
363
- - Dictionary of serve controller information.
364
-
365
- Each dictionary contains the following keys:
366
- 'cluster_name_on_cloud': The cluster_name_on_cloud used by SkyPilot
367
- 'cluster_name': The cluster name without the user hash
368
- 'user': The user who created the cluster. Fetched from pod label
369
- 'status': The cluster status (assumed UP if pod exists)
370
- 'pods': List of pod objects in the cluster
371
- 'launched_at': Timestamp of when the cluster was launched
372
- 'resources': sky.Resources object for the cluster
373
- """
374
- clusters: Dict[str, Dict] = {}
375
- jobs_controllers: Dict[str, Dict] = {}
376
- serve_controllers: Dict[str, Dict] = {}
377
-
378
- for pod in pods:
379
- cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
380
- cluster_name = cluster_name_on_cloud.rsplit(
381
- '-', 1
382
- )[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
383
-
384
- # Check if cluster name is name of a controller
385
- # Can't use controller_utils.Controllers.from_name(cluster_name)
386
- # because hash is different across users
387
- if 'controller' in cluster_name_on_cloud:
388
- start_time = pod.status.start_time.timestamp()
389
- controller_info = {
390
- 'cluster_name_on_cloud': cluster_name_on_cloud,
391
- 'cluster_name': cluster_name,
392
- 'user': pod.metadata.labels.get('skypilot-user'),
393
- 'status': status_lib.ClusterStatus.UP,
394
- # Assuming UP if pod exists
395
- 'pods': [pod],
396
- 'launched_at': start_time
397
- }
398
- if 'sky-jobs-controller' in cluster_name_on_cloud:
399
- jobs_controllers[cluster_name_on_cloud] = controller_info
400
- elif 'sky-serve-controller' in cluster_name_on_cloud:
401
- serve_controllers[cluster_name_on_cloud] = controller_info
402
-
403
- if cluster_name_on_cloud not in clusters:
404
- # Parse the start time for the cluster
405
- start_time = pod.status.start_time
406
- if start_time is not None:
407
- start_time = pod.status.start_time.timestamp()
408
-
409
- # Parse resources
410
- cpu_request = kubernetes_utils.parse_cpu_or_gpu_resource(
411
- pod.spec.containers[0].resources.requests.get('cpu', '0'))
412
- memory_request = kubernetes_utils.parse_memory_resource(
413
- pod.spec.containers[0].resources.requests.get('memory', '0'),
414
- unit='G')
415
- gpu_count = kubernetes_utils.parse_cpu_or_gpu_resource(
416
- pod.spec.containers[0].resources.requests.get(
417
- 'nvidia.com/gpu', '0'))
418
- if gpu_count > 0:
419
- label_formatter, _ = (
420
- kubernetes_utils.detect_gpu_label_formatter(context))
421
- assert label_formatter is not None, (
422
- 'GPU label formatter cannot be None if there are pods '
423
- f'requesting GPUs: {pod.metadata.name}')
424
- gpu_label = label_formatter.get_label_key()
425
- # Get GPU name from pod node selector
426
- if pod.spec.node_selector is not None:
427
- gpu_name = label_formatter.get_accelerator_from_label_value(
428
- pod.spec.node_selector.get(gpu_label))
429
-
430
- resources = resources_lib.Resources(
431
- cloud=sky_clouds.Kubernetes(),
432
- cpus=int(cpu_request),
433
- memory=int(memory_request),
434
- accelerators=(f'{gpu_name}:{gpu_count}'
435
- if gpu_count > 0 else None))
436
- if pod.status.phase == 'Pending':
437
- # If pod is pending, do not show it in the status
438
- continue
439
-
440
- clusters[cluster_name_on_cloud] = {
441
- 'cluster_name_on_cloud': cluster_name_on_cloud,
442
- 'cluster_name': cluster_name,
443
- 'user': pod.metadata.labels.get('skypilot-user'),
444
- 'status': status_lib.ClusterStatus.UP,
445
- 'pods': [],
446
- 'launched_at': start_time,
447
- 'resources': resources,
448
- }
449
- else:
450
- # Update start_time if this pod started earlier
451
- pod_start_time = pod.status.start_time
452
- if pod_start_time is not None:
453
- pod_start_time = pod_start_time.timestamp()
454
- if pod_start_time < clusters[cluster_name_on_cloud][
455
- 'launched_at']:
456
- clusters[cluster_name_on_cloud][
457
- 'launched_at'] = pod_start_time
458
- clusters[cluster_name_on_cloud]['pods'].append(pod)
459
- # Update resources_str in clusters:
460
- for cluster_name, cluster in clusters.items():
461
- resources = cluster['resources']
462
- num_pods = len(cluster['pods'])
463
- resources_str = f'{num_pods}x {resources}'
464
- cluster['resources_str'] = resources_str
465
- return list(clusters.values()), jobs_controllers, serve_controllers
sky/utils/ux_utils.py CHANGED
@@ -121,11 +121,6 @@ class RedirectOutputForProcess:
121
121
  raise
122
122
 
123
123
 
124
- def starting_message(message: str) -> str:
125
- """Gets the starting message for the given message."""
126
- return f'⚙︎ {message}'
127
-
128
-
129
124
  def log_path_hint(log_path: Union[str, 'pathlib.Path']) -> str:
130
125
  """Gets the log path hint for the given log path."""
131
126
  log_path = str(log_path)
@@ -135,21 +130,50 @@ def log_path_hint(log_path: Union[str, 'pathlib.Path']) -> str:
135
130
  return _LOG_PATH_HINT.format(log_path=log_path)
136
131
 
137
132
 
133
+ def starting_message(message: str) -> str:
134
+ """Gets the starting message for the given message."""
135
+ # We have to reset the color before the message, because sometimes if a
136
+ # previous spinner with dimmed color overflows in a narrow terminal, the
137
+ # color might be messed up.
138
+ return f'{colorama.Style.RESET_ALL}⚙︎ {message}'
139
+
140
+
138
141
  def finishing_message(
139
142
  message: str,
140
143
  log_path: Optional[Union[str, 'pathlib.Path']] = None) -> str:
141
144
  """Gets the finishing message for the given message."""
142
- success_prefix = (f'{colorama.Fore.GREEN}✓ {message}'
143
- f'{colorama.Style.RESET_ALL}')
145
+ # We have to reset the color before the message, because sometimes if a
146
+ # previous spinner with dimmed color overflows in a narrow terminal, the
147
+ # color might be messed up.
148
+ success_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
149
+ f'{message}{colorama.Style.RESET_ALL}')
144
150
  if log_path is None:
145
151
  return success_prefix
146
152
  path_hint = log_path_hint(log_path)
147
153
  return f'{success_prefix} {path_hint}'
148
154
 
149
155
 
156
+ def error_message(message: str,
157
+ log_path: Optional[Union[str, 'pathlib.Path']] = None) -> str:
158
+ """Gets the error message for the given message."""
159
+ # We have to reset the color before the message, because sometimes if a
160
+ # previous spinner with dimmed color overflows in a narrow terminal, the
161
+ # color might be messed up.
162
+ error_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
163
+ f'{colorama.Style.RESET_ALL} {message}')
164
+ if log_path is None:
165
+ return error_prefix
166
+ path_hint = log_path_hint(log_path)
167
+ return f'{error_prefix} {path_hint}'
168
+
169
+
150
170
  def retry_message(message: str) -> str:
151
171
  """Gets the retry message for the given message."""
152
- return f'{colorama.Fore.YELLOW}↺{colorama.Style.RESET_ALL} {message}'
172
+ # We have to reset the color before the message, because sometimes if a
173
+ # previous spinner with dimmed color overflows in a narrow terminal, the
174
+ # color might be messed up.
175
+ return (f'{colorama.Style.RESET_ALL}{colorama.Fore.YELLOW}↺'
176
+ f'{colorama.Style.RESET_ALL} {message}')
153
177
 
154
178
 
155
179
  def spinner_message(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241014
3
+ Version: 1.0.0.dev20241016
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -1,10 +1,10 @@
1
- sky/__init__.py,sha256=VNwdB81dfCeF4uRD4BuPNbQ3MmE9-P1PX9hLxEzmWmw,5854
1
+ sky/__init__.py,sha256=19EG_Nr4EJcbkyLvfF_ZmWhAbfEysS498RvephjOslM,5854
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=TfKkVnmRIetATSEVQFp-rOOIRGqVig2i8faSQQt_ixA,20974
4
4
  sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
5
- sky/cli.py,sha256=eC-uC2_OhuDephCdejr70VeB1bo1KGwHC8et5VR_lto,210828
5
+ sky/cli.py,sha256=PJR6W92twf89j17OWLQJ9RawdazJcGslfW2L_fLB2PM,208545
6
6
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
- sky/core.py,sha256=I7ZxjrJmrM-zAWVnmUyz0kzS9fjKaaLehDdWMm3H0x8,34443
7
+ sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
8
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
9
9
  sky/exceptions.py,sha256=D7WARzYRt4dGjXo6gI-gzkoodZbKF1D-qncm_DbHB28,8846
10
10
  sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
@@ -30,8 +30,8 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
30
30
  sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
31
31
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
32
32
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
33
- sky/backends/backend_utils.py,sha256=Kp3HRqjrisnDYzLURYSbtyZ7WGMKHSVfMSwRj5_aOnc,126770
34
- sky/backends/cloud_vm_ray_backend.py,sha256=otDjYA4FtnlFGU_xNaxhBM5wFQ8fuF4kPeYQmf1Me3E,236901
33
+ sky/backends/backend_utils.py,sha256=u9P7Fd3DB9LaOq51fK7kwKpxtgFGGWmgULY6GoLSUPM,126791
34
+ sky/backends/cloud_vm_ray_backend.py,sha256=9mCLLRUD-x3ksiiPbhrMDsZWIPNU9cVSQwwpmxSia7k,236881
35
35
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
36
36
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
37
37
  sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -46,7 +46,7 @@ sky/clouds/cloud.py,sha256=PPk-Cbf1YbJT8bswcQLtPBtko02OWrRGJKkLzDpytTI,34858
46
46
  sky/clouds/cloud_registry.py,sha256=4yQMv-iBSgyN5aNL4Qxbn0JVE-dkVoEUIgj7S1z9S_Q,955
47
47
  sky/clouds/cudo.py,sha256=H4VyMo5wWGAv2MXZ3xsbWjlZA_cZYnt4ecNlTOOao8Y,13147
48
48
  sky/clouds/fluidstack.py,sha256=iOmoOx52yTrHKMzwBDaxFJCfNo79M61d5tj-Np24Lyc,12436
49
- sky/clouds/gcp.py,sha256=CrSsaSXd83tM78foKH9viBfW1cQsjve3aUQbshsqvDg,54033
49
+ sky/clouds/gcp.py,sha256=FKHqtF4YMY06pseloMEbnt4XwIQ5ErDLlrvyXzIzZa4,54308
50
50
  sky/clouds/ibm.py,sha256=M8QdjeSFlwssfoY2aOodxG4q5R3eT9K-4lTPDHYvEYI,21476
51
51
  sky/clouds/kubernetes.py,sha256=aWoXWR-S4puZHzuUHroLKxLdTpkqU7j75dQlXECnsmE,28679
52
52
  sky/clouds/lambda_cloud.py,sha256=2Al3qCSl-I4iTi7pPPNXcbaLyVfCUgTl__vYBunLB6k,12439
@@ -110,7 +110,7 @@ sky/provision/docker_utils.py,sha256=Z7vDUs9Yjqks_CsWrACcTgABIZuFi3EJVFwkU0WsdD0
110
110
  sky/provision/instance_setup.py,sha256=n1Px_KOYZl7Rf1WLXrfTTHyqxyA8_5QTN9BNLjQRkgc,22427
111
111
  sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
112
112
  sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
113
- sky/provision/provisioner.py,sha256=7BatzBobHGCLJf2u5hCt4VveS9tOPc5zL4wGHR6OcqE,24888
113
+ sky/provision/provisioner.py,sha256=A4-yY0Q4GnkdJsHl_DLNEycq5wFKFsPwT0fwTNh1dG0,25016
114
114
  sky/provision/aws/__init__.py,sha256=mxq8PeWJqUtalDozTNpbtENErRZ1ktEs8uf2aG9UUgU,731
115
115
  sky/provision/aws/config.py,sha256=ApEh63RR_KyCp9nPXX35z6jBREoulJPQ5st8K9Jlclo,23385
116
116
  sky/provision/aws/instance.py,sha256=eCslJ2XfJo_pkQMnKFQqhGnUIRvwKiT12oxBY5-klss,40750
@@ -141,7 +141,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
141
141
  sky/provision/kubernetes/instance.py,sha256=Qth9AWc8OBGB7WeGJ4ERlopNA8y2wg1AvS5XPJEuXXQ,38421
142
142
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
143
143
  sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
144
- sky/provision/kubernetes/utils.py,sha256=oJgCrbR8IyTw_uMoM9oxYYYNN_tZ1yzppWNTDM-XqaM,84522
144
+ sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
145
145
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
146
146
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
147
147
  sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
@@ -187,7 +187,7 @@ sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
187
187
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
188
188
  sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
189
189
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
190
- sky/skylet/constants.py,sha256=txAK0602kGtaD42JUYECq5u4rLIZFhOIWz-fLUV1KgA,14652
190
+ sky/skylet/constants.py,sha256=OsuJcQp6UgkQ9Yfml6f_raXXbHS7-_h-v4QNv92y0Gw,14642
191
191
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
192
192
  sky/skylet/job_lib.py,sha256=csXhJ6lvAxjzSy2PzZvr0sPJyEoVkCvPi2Zjq9JZkHY,35884
193
193
  sky/skylet/log_lib.py,sha256=7rdmEr5Wy9CIGLkWls5-FVeqTL1O2GPy44uDaEAqMS0,19312
@@ -258,10 +258,10 @@ sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
258
258
  sky/utils/schemas.py,sha256=QT0Fxri2o0SiWkky1DlZhA1dzQRQoB5OdVaej0wJvhc,28787
259
259
  sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
260
260
  sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
261
- sky/utils/ux_utils.py,sha256=0Fy20UbM2rQ_aVIx_B2-oiLXGmzC1myn-0pnmb56pcA,5227
261
+ sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
262
262
  sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
263
263
  sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
264
- sky/utils/cli_utils/status_utils.py,sha256=zIaiGWtEACdczubiSBMTupueZStPt1VqgRrhQilLEaI,17954
264
+ sky/utils/cli_utils/status_utils.py,sha256=2HrH6IBJCJ__AbuZ0ooIEgarBKIVIA5M3myE5qYvToU,12330
265
265
  sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
266
266
  sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
267
267
  sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
@@ -273,9 +273,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
273
273
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
274
274
  sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
275
275
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
276
- skypilot_nightly-1.0.0.dev20241014.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
277
- skypilot_nightly-1.0.0.dev20241014.dist-info/METADATA,sha256=oTglaVnp9O-PgRAk74UFb1rMIKKjm9z896VnhSIV83g,18945
278
- skypilot_nightly-1.0.0.dev20241014.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
279
- skypilot_nightly-1.0.0.dev20241014.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
280
- skypilot_nightly-1.0.0.dev20241014.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
281
- skypilot_nightly-1.0.0.dev20241014.dist-info/RECORD,,
276
+ skypilot_nightly-1.0.0.dev20241016.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
277
+ skypilot_nightly-1.0.0.dev20241016.dist-info/METADATA,sha256=3cBwGMlr5S-mHHm8ZXtnMUNjCcSYeAzwOEf-N4LxLEU,18945
278
+ skypilot_nightly-1.0.0.dev20241016.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
279
+ skypilot_nightly-1.0.0.dev20241016.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
280
+ skypilot_nightly-1.0.0.dev20241016.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
281
+ skypilot_nightly-1.0.0.dev20241016.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5