skypilot-nightly 1.0.0.dev20241014__py3-none-any.whl → 1.0.0.dev20241015__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +3 -3
- sky/cli.py +3 -49
- sky/core.py +76 -1
- sky/provision/kubernetes/utils.py +113 -0
- sky/provision/provisioner.py +4 -1
- sky/utils/cli_utils/status_utils.py +15 -134
- sky/utils/ux_utils.py +32 -8
- {skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/RECORD +15 -15
- {skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'a4e2fcd438d70373377c85bcbec1b185ef04c99f'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241014'
+__version__ = '1.0.0.dev20241015'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/backend_utils.py
CHANGED
@@ -56,7 +56,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky import resources
+    from sky import resources as resources_lib
     from sky import task as task_lib
     from sky.backends import cloud_vm_ray_backend
     from sky.backends import local_docker_backend
@@ -751,7 +751,7 @@ def _replace_yaml_dicts(
 # TODO: too many things happening here - leaky abstraction. Refactor.
 @timeline.event
 def write_cluster_config(
-        to_provision: 'resources.Resources',
+        to_provision: 'resources_lib.Resources',
         num_nodes: int,
         cluster_config_template: str,
         cluster_name: str,

sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -2844,9 +2844,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     time.sleep(gap_seconds)
                     continue
                 logger.error(
-
-
-
+                    ux_utils.error_message(
+                        'Failed to provision resources. '
+                        f'{ux_utils.log_path_hint(log_path)}'))
                 error_message += (
                     '\nTo keep retrying until the cluster is up, use '
                     'the `--retry-until-up` flag.')
sky/cli.py
CHANGED
@@ -1464,54 +1464,8 @@ def _status_kubernetes(show_all: bool):
     Args:
         show_all (bool): Show all job information (e.g., start time, failures).
     """
-    context = kubernetes_utils.get_current_kube_config_context_name()
-    try:
-        pods = kubernetes_utils.get_skypilot_pods(context)
-    except exceptions.ResourcesUnavailableError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('Failed to get SkyPilot pods from '
-                             f'Kubernetes: {str(e)}') from e
-    all_clusters, jobs_controllers, serve_controllers = (
-        status_utils.process_skypilot_pods(pods, context))
-    all_jobs = []
-    with rich_utils.safe_status(
-            '[bold cyan]Checking in-progress managed jobs[/]') as spinner:
-        for i, (_, job_controller_info) in enumerate(jobs_controllers.items()):
-            user = job_controller_info['user']
-            pod = job_controller_info['pods'][0]
-            status_message = ('[bold cyan]Checking managed jobs controller')
-            if len(jobs_controllers) > 1:
-                status_message += f's ({i+1}/{len(jobs_controllers)})'
-            spinner.update(f'{status_message}[/]')
-            try:
-                job_list = managed_jobs.queue_from_kubernetes_pod(
-                    pod.metadata.name)
-            except RuntimeError as e:
-                logger.warning('Failed to get managed jobs from controller '
-                               f'{pod.metadata.name}: {str(e)}')
-                job_list = []
-            # Add user field to jobs
-            for job in job_list:
-                job['user'] = user
-            all_jobs.extend(job_list)
-    # Reconcile cluster state between managed jobs and clusters:
-    # To maintain a clear separation between regular SkyPilot clusters
-    # and those from managed jobs, we need to exclude the latter from
-    # the main cluster list.
-    # We do this by reconstructing managed job cluster names from each
-    # job's name and ID. We then use this set to filter out managed
-    # clusters from the main cluster list. This is necessary because there
-    # are no identifiers distinguishing clusters from managed jobs from
-    # regular clusters.
-    managed_job_cluster_names = set()
-    for job in all_jobs:
-        # Managed job cluster name is <job_name>-<job_id>
-        managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
-        managed_job_cluster_names.add(managed_cluster_name)
-    unmanaged_clusters = [
-        c for c in all_clusters
-        if c['cluster_name'] not in managed_job_cluster_names
-    ]
+    all_clusters, unmanaged_clusters, all_jobs, context = (
+        core.status_kubernetes())
     click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'Kubernetes cluster state (context: {context})'
                f'{colorama.Style.RESET_ALL}')
@@ -1523,7 +1477,7 @@ def _status_kubernetes(show_all: bool):
                f'{colorama.Style.RESET_ALL}')
     msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
     click.echo(msg)
-    if any(['sky-serve-controller' in c['cluster_name'] for c in all_clusters]):
+    if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
         # TODO: Parse serve controllers and show services separately.
         # Currently we show a hint that services are shown as clusters.
        click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
sky/core.py
CHANGED
@@ -1,7 +1,7 @@
 """SDK functions for cluster/job management."""
 import getpass
 import typing
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import colorama
 
@@ -11,10 +11,12 @@ from sky import dag
 from sky import data
 from sky import exceptions
 from sky import global_user_state
+from sky import jobs as managed_jobs
 from sky import sky_logging
 from sky import status_lib
 from sky import task
 from sky.backends import backend_utils
+from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -111,6 +113,79 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
         cluster_names=cluster_names)
 
 
+def status_kubernetes(
+) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
+           List['kubernetes_utils.KubernetesSkyPilotClusterInfo'], List[Dict[
+               str, Any]], Optional[str]]:
+    """Get all SkyPilot clusters and jobs in the Kubernetes cluster.
+
+    Managed jobs and services are also included in the clusters returned.
+    The caller must parse the controllers to identify which clusters are run
+    as managed jobs or services.
+    all_clusters, unmanaged_clusters, all_jobs, context
+    Returns:
+        A tuple containing:
+        - all_clusters: List of KubernetesSkyPilotClusterInfo with info for
+          all clusters, including managed jobs, services and controllers.
+        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfo with info
+          for all clusters excluding managed jobs and services. Controllers
+          are included.
+        - all_jobs: List of managed jobs from all controllers. Each entry is a
+          dictionary job info, see jobs.queue_from_kubernetes_pod for details.
+        - context: Kubernetes context used to fetch the cluster information.
+    """
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        pods = kubernetes_utils.get_skypilot_pods(context)
+    except exceptions.ResourcesUnavailableError as e:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Failed to get SkyPilot pods from '
+                             f'Kubernetes: {str(e)}') from e
+    all_clusters, jobs_controllers, _ = (kubernetes_utils.process_skypilot_pods(
+        pods, context))
+    all_jobs = []
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                '[bold cyan]Checking in-progress managed jobs[/]')) as spinner:
+        for i, job_controller_info in enumerate(jobs_controllers):
+            user = job_controller_info.user
+            pod = job_controller_info.pods[0]
+            status_message = '[bold cyan]Checking managed jobs controller'
+            if len(jobs_controllers) > 1:
+                status_message += f's ({i + 1}/{len(jobs_controllers)})'
+            spinner.update(f'{status_message}[/]')
+            try:
+                job_list = managed_jobs.queue_from_kubernetes_pod(
+                    pod.metadata.name)
+            except RuntimeError as e:
+                logger.warning('Failed to get managed jobs from controller '
+                               f'{pod.metadata.name}: {str(e)}')
+                job_list = []
+            # Add user field to jobs
+            for job in job_list:
+                job['user'] = user
+            all_jobs.extend(job_list)
+    # Reconcile cluster state between managed jobs and clusters:
+    # To maintain a clear separation between regular SkyPilot clusters
+    # and those from managed jobs, we need to exclude the latter from
+    # the main cluster list.
+    # We do this by reconstructing managed job cluster names from each
+    # job's name and ID. We then use this set to filter out managed
+    # clusters from the main cluster list. This is necessary because there
+    # are no identifiers distinguishing clusters from managed jobs from
+    # regular clusters.
+    managed_job_cluster_names = set()
+    for job in all_jobs:
+        # Managed job cluster name is <job_name>-<job_id>
+        managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
+        managed_job_cluster_names.add(managed_cluster_name)
+    unmanaged_clusters = [
+        c for c in all_clusters
+        if c.cluster_name not in managed_job_cluster_names
+    ]
+    return all_clusters, unmanaged_clusters, all_jobs, context
+
+
 def endpoints(cluster: str,
               port: Optional[Union[int, str]] = None) -> Dict[int, str]:
     """Gets the endpoint for a given cluster and port number (endpoint).
sky/provision/kubernetes/utils.py
CHANGED
@@ -15,9 +15,11 @@ import jinja2
 import yaml
 
 import sky
+from sky import clouds
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
+from sky import status_lib
 from sky.adaptors import kubernetes
 from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import network_utils
@@ -30,6 +32,7 @@ from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     from sky import backends
+    from sky import resources as resources_lib
 
 # TODO(romilb): Move constants to constants.py
 DEFAULT_NAMESPACE = 'default'
@@ -2023,3 +2026,113 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
             'kubectl get pods --selector=skypilot-cluster --all-namespaces'
         ) from None
     return pods
+
+
+@dataclasses.dataclass
+class KubernetesSkyPilotClusterInfo:
+    cluster_name_on_cloud: str
+    cluster_name: str
+    user: str
+    status: status_lib.ClusterStatus
+    pods: List[Any]
+    launched_at: float
+    resources: 'resources_lib.Resources'
+    resources_str: str
+
+
+def process_skypilot_pods(
+    pods: List[Any],
+    context: Optional[str] = None
+) -> Tuple[List[KubernetesSkyPilotClusterInfo],
+           List[KubernetesSkyPilotClusterInfo],
+           List[KubernetesSkyPilotClusterInfo]]:
+    """Process SkyPilot pods on k8s to extract cluster and controller info.
+
+    Args:
+        pods: List of Kubernetes pod objects.
+        context: Kubernetes context name, used to detect GPU label formatter.
+
+    Returns:
+        A tuple containing:
+        - List of KubernetesSkyPilotClusterInfo with all cluster info.
+        - List of KubernetesSkyPilotClusterInfo with job controller info.
+        - List of KubernetesSkyPilotClusterInfo with serve controller info.
+    """
+    # pylint: disable=import-outside-toplevel
+    from sky import resources as resources_lib
+    clusters: Dict[str, KubernetesSkyPilotClusterInfo] = {}
+    jobs_controllers: List[KubernetesSkyPilotClusterInfo] = []
+    serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
+
+    for pod in pods:
+        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name = cluster_name_on_cloud.rsplit(
+            '-', 1
+        )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
+        if cluster_name_on_cloud not in clusters:
+            # Parse the start time for the cluster
+            start_time = pod.status.start_time
+            if start_time is not None:
+                start_time = pod.status.start_time.timestamp()
+
+            # Parse resources
+            cpu_request = parse_cpu_or_gpu_resource(
+                pod.spec.containers[0].resources.requests.get('cpu', '0'))
+            memory_request = parse_memory_resource(
+                pod.spec.containers[0].resources.requests.get('memory', '0'),
+                unit='G')
+            gpu_count = parse_cpu_or_gpu_resource(
+                pod.spec.containers[0].resources.requests.get(
+                    'nvidia.com/gpu', '0'))
+            gpu_name = None
+            if gpu_count > 0:
+                label_formatter, _ = (detect_gpu_label_formatter(context))
+                assert label_formatter is not None, (
+                    'GPU label formatter cannot be None if there are pods '
+                    f'requesting GPUs: {pod.metadata.name}')
+                gpu_label = label_formatter.get_label_key()
+                # Get GPU name from pod node selector
+                if pod.spec.node_selector is not None:
+                    gpu_name = label_formatter.get_accelerator_from_label_value(
+                        pod.spec.node_selector.get(gpu_label))
+
+            resources = resources_lib.Resources(
+                cloud=clouds.Kubernetes(),
+                cpus=int(cpu_request),
+                memory=int(memory_request),
+                accelerators=(f'{gpu_name}:{gpu_count}'
+                              if gpu_count > 0 else None))
+            if pod.status.phase == 'Pending':
+                # If pod is pending, do not show it in the status
+                continue
+
+            cluster_info = KubernetesSkyPilotClusterInfo(
+                cluster_name_on_cloud=cluster_name_on_cloud,
+                cluster_name=cluster_name,
+                user=pod.metadata.labels.get('skypilot-user'),
+                status=status_lib.ClusterStatus.UP,
+                pods=[],
+                launched_at=start_time,
+                resources=resources,
+                resources_str='')
+            clusters[cluster_name_on_cloud] = cluster_info
+            # Check if cluster name is name of a controller
+            # Can't use controller_utils.Controllers.from_name(cluster_name)
+            # because hash is different across users
+            if 'sky-jobs-controller' in cluster_name_on_cloud:
+                jobs_controllers.append(cluster_info)
+            elif 'sky-serve-controller' in cluster_name_on_cloud:
+                serve_controllers.append(cluster_info)
+        else:
+            # Update start_time if this pod started earlier
+            pod_start_time = pod.status.start_time
+            if pod_start_time is not None:
+                pod_start_time = pod_start_time.timestamp()
+                if pod_start_time < clusters[cluster_name_on_cloud].launched_at:
+                    clusters[cluster_name_on_cloud].launched_at = pod_start_time
+        clusters[cluster_name_on_cloud].pods.append(pod)
+    # Update resources_str in clusters:
+    for cluster in clusters.values():
+        num_pods = len(cluster.pods)
+        cluster.resources_str = f'{num_pods}x {cluster.resources}'
+    return list(clusters.values()), jobs_controllers, serve_controllers
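
The dict-of-dicts bookkeeping previously kept in status_utils is replaced by the KubernetesSkyPilotClusterInfo dataclass above. A hedged sketch of driving the relocated helper directly (all names follow the diff; the loop body is illustrative):

    # Sketch: list SkyPilot pods and group them into cluster/controller info.
    from sky.provision.kubernetes import utils as kubernetes_utils

    context = kubernetes_utils.get_current_kube_config_context_name()
    pods = kubernetes_utils.get_skypilot_pods(context)
    clusters, jobs_controllers, serve_controllers = (
        kubernetes_utils.process_skypilot_pods(pods, context))
    for info in jobs_controllers:
        # Attribute access replaces the old dict lookups ('user', 'pods', ...).
        print(info.cluster_name_on_cloud, info.user, len(info.pods))
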
sky/provision/provisioner.py
CHANGED
@@ -571,7 +571,10 @@ def post_provision_runtime_setup(
             provision_record=provision_record,
             custom_resource=custom_resource)
     except Exception:  # pylint: disable=broad-except
-        logger.error(
+        logger.error(
+            ux_utils.error_message(
+                'Failed to set up SkyPilot runtime on cluster.',
+                provision_logging.config.log_path))
         logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
         with ux_utils.print_exception_no_traceback():
             raise
sky/utils/cli_utils/status_utils.py
CHANGED
@@ -1,19 +1,20 @@
 """Utilities for sky status."""
-from typing import Any, Callable, Dict, List, Optional, Tuple
+import typing
+from typing import Any, Callable, Dict, List, Optional
 
 import click
 import colorama
 
 from sky import backends
-from sky import clouds as sky_clouds
-from sky import resources as resources_lib
 from sky import status_lib
-from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import resources_utils
 
+if typing.TYPE_CHECKING:
+    from sky.provision.kubernetes import utils as kubernetes_utils
+
 COMMAND_TRUNC_LENGTH = 25
 NUM_COST_REPORT_LINES = 5
 
@@ -303,19 +304,19 @@ def _get_estimated_cost_for_cost_report(
     return f'$ {cost:.2f}'
 
 
-def show_kubernetes_cluster_status_table(clusters: List[Any],
-                                         show_all: bool) -> None:
+def show_kubernetes_cluster_status_table(
+        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
+        show_all: bool) -> None:
     """Compute cluster table values and display for Kubernetes clusters."""
     status_columns = [
-        StatusColumn('USER', lambda c: c['user']),
-        StatusColumn('NAME', lambda c: c['cluster_name']),
-        StatusColumn(
-            'LAUNCHED',
-            lambda c: log_utils.readable_time_duration(c['launched_at'])),
+        StatusColumn('USER', lambda c: c.user),
+        StatusColumn('NAME', lambda c: c.cluster_name),
+        StatusColumn('LAUNCHED',
+                     lambda c: log_utils.readable_time_duration(c.launched_at)),
         StatusColumn('RESOURCES',
-                     lambda c: c['resources_str'],
+                     lambda c: c.resources_str,
                      trunc_length=70 if not show_all else 0),
-        StatusColumn('STATUS', lambda c: c['status'].colored_str()),
+        StatusColumn('STATUS', lambda c: c.status.colored_str()),
         # TODO(romilb): We should consider adding POD_NAME field here when --all
         # is passed to help users fetch pod name programmatically.
     ]
@@ -326,8 +327,7 @@ def show_kubernetes_cluster_status_table(clusters: List[Any],
     cluster_table = log_utils.create_table(columns)
 
     # Sort table by user, then by cluster name
-    sorted_clusters = sorted(clusters,
-                             key=lambda c: (c['user'], c['cluster_name']))
+    sorted_clusters = sorted(clusters, key=lambda c: (c.user, c.cluster_name))
 
     for cluster in sorted_clusters:
         row = []
@@ -344,122 +344,3 @@ def show_kubernetes_cluster_status_table(clusters: List[Any],
     else:
         click.echo('No SkyPilot resources found in the '
                    'active Kubernetes context.')
-
-
-def process_skypilot_pods(
-    pods: List[Any],
-    context: Optional[str] = None
-) -> Tuple[List[Dict[Any, Any]], Dict[str, Any], Dict[str, Any]]:
-    """Process SkyPilot pods on k8s to extract cluster and controller info.
-
-    Args:
-        pods: List of Kubernetes pod objects.
-        context: Kubernetes context name, used to detect GPU label formatter.
-
-    Returns:
-        A tuple containing:
-        - List of dictionaries with cluster information.
-        - Dictionary of job controller information.
-        - Dictionary of serve controller information.
-
-        Each dictionary contains the following keys:
-        'cluster_name_on_cloud': The cluster_name_on_cloud used by SkyPilot
-        'cluster_name': The cluster name without the user hash
-        'user': The user who created the cluster. Fetched from pod label
-        'status': The cluster status (assumed UP if pod exists)
-        'pods': List of pod objects in the cluster
-        'launched_at': Timestamp of when the cluster was launched
-        'resources': sky.Resources object for the cluster
-    """
-    clusters: Dict[str, Dict] = {}
-    jobs_controllers: Dict[str, Dict] = {}
-    serve_controllers: Dict[str, Dict] = {}
-
-    for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
-        cluster_name = cluster_name_on_cloud.rsplit(
-            '-', 1
-        )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
-
-        # Check if cluster name is name of a controller
-        # Can't use controller_utils.Controllers.from_name(cluster_name)
-        # because hash is different across users
-        if 'controller' in cluster_name_on_cloud:
-            start_time = pod.status.start_time.timestamp()
-            controller_info = {
-                'cluster_name_on_cloud': cluster_name_on_cloud,
-                'cluster_name': cluster_name,
-                'user': pod.metadata.labels.get('skypilot-user'),
-                'status': status_lib.ClusterStatus.UP,
-                # Assuming UP if pod exists
-                'pods': [pod],
-                'launched_at': start_time
-            }
-            if 'sky-jobs-controller' in cluster_name_on_cloud:
-                jobs_controllers[cluster_name_on_cloud] = controller_info
-            elif 'sky-serve-controller' in cluster_name_on_cloud:
-                serve_controllers[cluster_name_on_cloud] = controller_info
-
-        if cluster_name_on_cloud not in clusters:
-            # Parse the start time for the cluster
-            start_time = pod.status.start_time
-            if start_time is not None:
-                start_time = pod.status.start_time.timestamp()
-
-            # Parse resources
-            cpu_request = kubernetes_utils.parse_cpu_or_gpu_resource(
-                pod.spec.containers[0].resources.requests.get('cpu', '0'))
-            memory_request = kubernetes_utils.parse_memory_resource(
-                pod.spec.containers[0].resources.requests.get('memory', '0'),
-                unit='G')
-            gpu_count = kubernetes_utils.parse_cpu_or_gpu_resource(
-                pod.spec.containers[0].resources.requests.get(
-                    'nvidia.com/gpu', '0'))
-            if gpu_count > 0:
-                label_formatter, _ = (
-                    kubernetes_utils.detect_gpu_label_formatter(context))
-                assert label_formatter is not None, (
-                    'GPU label formatter cannot be None if there are pods '
-                    f'requesting GPUs: {pod.metadata.name}')
-                gpu_label = label_formatter.get_label_key()
-                # Get GPU name from pod node selector
-                if pod.spec.node_selector is not None:
-                    gpu_name = label_formatter.get_accelerator_from_label_value(
-                        pod.spec.node_selector.get(gpu_label))
-
-            resources = resources_lib.Resources(
-                cloud=sky_clouds.Kubernetes(),
-                cpus=int(cpu_request),
-                memory=int(memory_request),
-                accelerators=(f'{gpu_name}:{gpu_count}'
-                              if gpu_count > 0 else None))
-            if pod.status.phase == 'Pending':
-                # If pod is pending, do not show it in the status
-                continue
-
-            clusters[cluster_name_on_cloud] = {
-                'cluster_name_on_cloud': cluster_name_on_cloud,
-                'cluster_name': cluster_name,
-                'user': pod.metadata.labels.get('skypilot-user'),
-                'status': status_lib.ClusterStatus.UP,
-                'pods': [],
-                'launched_at': start_time,
-                'resources': resources,
-            }
-        else:
-            # Update start_time if this pod started earlier
-            pod_start_time = pod.status.start_time
-            if pod_start_time is not None:
-                pod_start_time = pod_start_time.timestamp()
-                if pod_start_time < clusters[cluster_name_on_cloud][
-                        'launched_at']:
-                    clusters[cluster_name_on_cloud][
-                        'launched_at'] = pod_start_time
-        clusters[cluster_name_on_cloud]['pods'].append(pod)
-    # Update resources_str in clusters:
-    for cluster_name, cluster in clusters.items():
-        resources = cluster['resources']
-        num_pods = len(cluster['pods'])
-        resources_str = f'{num_pods}x {resources}'
-        cluster['resources_str'] = resources_str
-    return list(clusters.values()), jobs_controllers, serve_controllers
sky/utils/ux_utils.py
CHANGED
@@ -121,11 +121,6 @@ class RedirectOutputForProcess:
             raise
 
 
-def starting_message(message: str) -> str:
-    """Gets the starting message for the given message."""
-    return f'⚙︎ {message}'
-
-
 def log_path_hint(log_path: Union[str, 'pathlib.Path']) -> str:
     """Gets the log path hint for the given log path."""
     log_path = str(log_path)
@@ -135,21 +130,50 @@ def log_path_hint(log_path: Union[str, 'pathlib.Path']) -> str:
     return _LOG_PATH_HINT.format(log_path=log_path)
 
 
+def starting_message(message: str) -> str:
+    """Gets the starting message for the given message."""
+    # We have to reset the color before the message, because sometimes if a
+    # previous spinner with dimmed color overflows in a narrow terminal, the
+    # color might be messed up.
+    return f'{colorama.Style.RESET_ALL}⚙︎ {message}'
+
+
 def finishing_message(
         message: str,
         log_path: Optional[Union[str, 'pathlib.Path']] = None) -> str:
     """Gets the finishing message for the given message."""
-    success_prefix = (f'{colorama.Fore.GREEN}✓ '
-                      f'{message}{colorama.Style.RESET_ALL}')
+    # We have to reset the color before the message, because sometimes if a
+    # previous spinner with dimmed color overflows in a narrow terminal, the
+    # color might be messed up.
+    success_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
+                      f'{message}{colorama.Style.RESET_ALL}')
     if log_path is None:
         return success_prefix
     path_hint = log_path_hint(log_path)
     return f'{success_prefix} {path_hint}'
 
 
+def error_message(message: str,
+                  log_path: Optional[Union[str, 'pathlib.Path']] = None) -> str:
+    """Gets the error message for the given message."""
+    # We have to reset the color before the message, because sometimes if a
+    # previous spinner with dimmed color overflows in a narrow terminal, the
+    # color might be messed up.
+    error_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
+                    f'{colorama.Style.RESET_ALL} {message}')
+    if log_path is None:
+        return error_prefix
+    path_hint = log_path_hint(log_path)
+    return f'{error_prefix} {path_hint}'
+
+
 def retry_message(message: str) -> str:
     """Gets the retry message for the given message."""
-    return f'{colorama.Fore.YELLOW}↺{colorama.Style.RESET_ALL} {message}'
+    # We have to reset the color before the message, because sometimes if a
+    # previous spinner with dimmed color overflows in a narrow terminal, the
+    # color might be messed up.
+    return (f'{colorama.Style.RESET_ALL}{colorama.Fore.YELLOW}↺'
+            f'{colorama.Style.RESET_ALL} {message}')
 
 
 def spinner_message(
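
For reference, a short sketch of how the refactored helpers compose status lines (illustrative; the print calls and message strings are ours, the helper signatures come from the diff above):

    from sky.utils import ux_utils

    print(ux_utils.starting_message('Launching cluster.'))
    print(ux_utils.retry_message('Provisioning failed; retrying.'))
    # log_path accepts a str or pathlib.Path; the log hint is appended.
    print(ux_utils.error_message('Failed to provision resources.',
                                 log_path='provision.log'))
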
{skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/RECORD
RENAMED
@@ -1,10 +1,10 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=wp2_9HrUWrDIKzYNWBHPGOeQdgEfO8jWVmulsFY4BLY,5854
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=TfKkVnmRIetATSEVQFp-rOOIRGqVig2i8faSQQt_ixA,20974
 sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
-sky/cli.py,sha256=
+sky/cli.py,sha256=PJR6W92twf89j17OWLQJ9RawdazJcGslfW2L_fLB2PM,208545
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
-sky/core.py,sha256=
+sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
 sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
 sky/exceptions.py,sha256=D7WARzYRt4dGjXo6gI-gzkoodZbKF1D-qncm_DbHB28,8846
 sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
@@ -30,8 +30,8 @@ sky/adaptors/runpod.py,sha256=4Nt_BfZhJAKQNA3wO8cxvvNI8x4NsDGHu_4EhRDlGYQ,225
 sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
-sky/backends/backend_utils.py,sha256=
-sky/backends/cloud_vm_ray_backend.py,sha256=
+sky/backends/backend_utils.py,sha256=u9P7Fd3DB9LaOq51fK7kwKpxtgFGGWmgULY6GoLSUPM,126791
+sky/backends/cloud_vm_ray_backend.py,sha256=9mCLLRUD-x3ksiiPbhrMDsZWIPNU9cVSQwwpmxSia7k,236881
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
 sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -110,7 +110,7 @@ sky/provision/docker_utils.py,sha256=Z7vDUs9Yjqks_CsWrACcTgABIZuFi3EJVFwkU0WsdD0
 sky/provision/instance_setup.py,sha256=n1Px_KOYZl7Rf1WLXrfTTHyqxyA8_5QTN9BNLjQRkgc,22427
 sky/provision/logging.py,sha256=yZWgejrFBhhRjAtvFu5N5bRXIMK5TuwNjp1vKQqz2pw,2103
 sky/provision/metadata_utils.py,sha256=LrxeV4wD2QPzNdXV_npj8q-pr35FatxBBjF_jSbpOT0,4013
-sky/provision/provisioner.py,sha256=
+sky/provision/provisioner.py,sha256=A4-yY0Q4GnkdJsHl_DLNEycq5wFKFsPwT0fwTNh1dG0,25016
 sky/provision/aws/__init__.py,sha256=mxq8PeWJqUtalDozTNpbtENErRZ1ktEs8uf2aG9UUgU,731
 sky/provision/aws/config.py,sha256=ApEh63RR_KyCp9nPXX35z6jBREoulJPQ5st8K9Jlclo,23385
 sky/provision/aws/instance.py,sha256=eCslJ2XfJo_pkQMnKFQqhGnUIRvwKiT12oxBY5-klss,40750
@@ -141,7 +141,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
 sky/provision/kubernetes/instance.py,sha256=Qth9AWc8OBGB7WeGJ4ERlopNA8y2wg1AvS5XPJEuXXQ,38421
 sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
 sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
-sky/provision/kubernetes/utils.py,sha256=
+sky/provision/kubernetes/utils.py,sha256=2N5c4yA7CEn4DjvCiUO73W4XDEjgixcJRVdgs913QQE,89523
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
@@ -258,10 +258,10 @@ sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
 sky/utils/schemas.py,sha256=QT0Fxri2o0SiWkky1DlZhA1dzQRQoB5OdVaej0wJvhc,28787
 sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
 sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
-sky/utils/ux_utils.py,sha256=
+sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
 sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sky/utils/cli_utils/status_utils.py,sha256=
+sky/utils/cli_utils/status_utils.py,sha256=2HrH6IBJCJ__AbuZ0ooIEgarBKIVIA5M3myE5qYvToU,12330
 sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
 sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
@@ -273,9 +273,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.dev20241014.dist-info/LICENSE,sha256=
-skypilot_nightly-1.0.0.dev20241014.dist-info/METADATA,sha256=
-skypilot_nightly-1.0.0.dev20241014.dist-info/WHEEL,sha256=
-skypilot_nightly-1.0.0.dev20241014.dist-info/entry_points.txt,sha256=
-skypilot_nightly-1.0.0.dev20241014.dist-info/top_level.txt,sha256=
-skypilot_nightly-1.0.0.dev20241014.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20241015.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241015.dist-info/METADATA,sha256=bi4nYXgxrzZkWNu-u5MEHjWs5J91R9LhI9pG3RuQvqo,18945
+skypilot_nightly-1.0.0.dev20241015.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+skypilot_nightly-1.0.0.dev20241015.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241015.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241015.dist-info/RECORD,,

{skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/LICENSE
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/WHEEL
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/entry_points.txt
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20241014.dist-info → skypilot_nightly-1.0.0.dev20241015.dist-info}/top_level.txt
RENAMED
File without changes