skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241012__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +92 -3
- sky/data/storage_utils.py +4 -3
- sky/jobs/__init__.py +2 -0
- sky/jobs/core.py +78 -0
- sky/jobs/utils.py +20 -9
- sky/provision/kubernetes/utils.py +25 -0
- sky/utils/cli_utils/status_utils.py +168 -21
- sky/utils/common_utils.py +20 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/RECORD +15 -15
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'fdd68b209ee74f9282fac5c6834907d5fe72d255'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241012'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/cli.py
CHANGED
@@ -1458,6 +1458,79 @@ def _get_services(service_names: Optional[List[str]],
|
|
1458
1458
|
return num_services, msg
|
1459
1459
|
|
1460
1460
|
|
1461
|
+
def _status_kubernetes(show_all: bool):
    """Print the SkyPilot resources found in the current Kubernetes context.

    Lists the SkyPilot pods of the current kubeconfig context, asks every
    managed jobs controller pod for its job queue, and then prints the
    clusters that do not belong to a managed job, followed by the managed
    jobs table when at least one job was found.

    Args:
        show_all (bool): Show all job information (e.g., start time, failures).

    Raises:
        ValueError: If the SkyPilot pods cannot be fetched from Kubernetes.
    """
    context = kubernetes_utils.get_current_kube_config_context_name()
    try:
        pods = kubernetes_utils.get_skypilot_pods(context)
    except exceptions.ResourcesUnavailableError as e:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Failed to get SkyPilot pods from '
                             f'Kubernetes: {str(e)}') from e
    all_clusters, jobs_controllers, serve_controllers = (
        status_utils.process_skypilot_pods(pods, context))

    num_controllers = len(jobs_controllers)
    all_jobs = []
    with rich_utils.safe_status(
            '[bold cyan]Checking in-progress managed jobs[/]') as spinner:
        for position, controller in enumerate(jobs_controllers.values(),
                                              start=1):
            owner = controller['user']
            controller_pod = controller['pods'][0]
            progress = '[bold cyan]Checking managed jobs controller'
            if num_controllers > 1:
                progress += f's ({position}/{num_controllers})'
            spinner.update(f'{progress}[/]')
            try:
                controller_jobs = managed_jobs.queue_from_kubernetes_pod(
                    controller_pod.metadata.name)
            except RuntimeError as e:
                # Best effort: one unreachable controller should not hide the
                # jobs of the others.
                logger.warning('Failed to get managed jobs from controller '
                               f'{controller_pod.metadata.name}: {str(e)}')
                controller_jobs = []
            # Tag every job with the user owning the controller it came from.
            for job in controller_jobs:
                job['user'] = owner
            all_jobs.extend(controller_jobs)

    # Clusters launched for managed jobs carry no identifier that tells them
    # apart from regular clusters, so their names are rebuilt as
    # <job_name>-<job_id> and used to filter them out of the cluster list.
    managed_job_cluster_names = {
        f'{job["job_name"]}-{job["job_id"]}' for job in all_jobs
    }
    unmanaged_clusters = [
        cluster for cluster in all_clusters
        if cluster['cluster_name'] not in managed_job_cluster_names
    ]

    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
               f'Kubernetes cluster state (context: {context})'
               f'{colorama.Style.RESET_ALL}')
    status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
                                                      show_all)
    if all_jobs:
        click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                   f'Managed jobs'
                   f'{colorama.Style.RESET_ALL}')
        msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
        click.echo(msg)
    if serve_controllers:
        # TODO: Parse serve controllers and show services separately.
        # Currently we show a hint that services are shown as clusters.
        click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
                   'shown in the "SkyPilot clusters" section.'
                   f'{colorama.Style.RESET_ALL}')
|
1532
|
+
|
1533
|
+
|
1461
1534
|
@cli.command()
|
1462
1535
|
@click.option('--all',
|
1463
1536
|
'-a',
|
@@ -1503,6 +1576,14 @@ def _get_services(service_names: Optional[List[str]],
|
|
1503
1576
|
is_flag=True,
|
1504
1577
|
required=False,
|
1505
1578
|
help='Also show sky serve services, if any.')
|
1579
|
+
@click.option(
|
1580
|
+
'--kubernetes',
|
1581
|
+
'--k8s',
|
1582
|
+
default=False,
|
1583
|
+
is_flag=True,
|
1584
|
+
required=False,
|
1585
|
+
help='[Experimental] Show all SkyPilot resources (including from other '
|
1586
|
+
'users) in the current Kubernetes context.')
|
1506
1587
|
@click.argument('clusters',
|
1507
1588
|
required=False,
|
1508
1589
|
type=str,
|
@@ -1512,7 +1593,7 @@ def _get_services(service_names: Optional[List[str]],
|
|
1512
1593
|
# pylint: disable=redefined-builtin
|
1513
1594
|
def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
1514
1595
|
endpoint: Optional[int], show_managed_jobs: bool,
|
1515
|
-
show_services: bool, clusters: List[str]):
|
1596
|
+
show_services: bool, kubernetes: bool, clusters: List[str]):
|
1516
1597
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1517
1598
|
"""Show clusters.
|
1518
1599
|
|
@@ -1571,6 +1652,9 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1571
1652
|
or for autostop-enabled clusters, use ``--refresh`` to query the latest
|
1572
1653
|
cluster statuses from the cloud providers.
|
1573
1654
|
"""
|
1655
|
+
if kubernetes:
|
1656
|
+
_status_kubernetes(all)
|
1657
|
+
return
|
1574
1658
|
# Using a pool with 2 worker to run the managed job query and sky serve
|
1575
1659
|
# service query in parallel to speed up. The pool provides a AsyncResult
|
1576
1660
|
# object that can be used as a future.
|
@@ -3113,7 +3197,12 @@ def show_gpus(
|
|
3113
3197
|
print_section_titles = False
|
3114
3198
|
# If cloud is kubernetes, we want to show real-time capacity
|
3115
3199
|
if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
|
3116
|
-
|
3200
|
+
if region:
|
3201
|
+
context = region
|
3202
|
+
else:
|
3203
|
+
# If region is not specified, we use the current context
|
3204
|
+
context = (
|
3205
|
+
kubernetes_utils.get_current_kube_config_context_name())
|
3117
3206
|
try:
|
3118
3207
|
# If --cloud kubernetes is not specified, we want to catch
|
3119
3208
|
# the case where no GPUs are available on the cluster and
|
@@ -3128,7 +3217,7 @@ def show_gpus(
|
|
3128
3217
|
else:
|
3129
3218
|
print_section_titles = True
|
3130
3219
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3131
|
-
f'Kubernetes GPUs (
|
3220
|
+
f'Kubernetes GPUs (context: {context})'
|
3132
3221
|
f'{colorama.Style.RESET_ALL}\n')
|
3133
3222
|
yield from k8s_realtime_table.get_string()
|
3134
3223
|
k8s_node_table = _get_kubernetes_node_info_table(context)
|
sky/data/storage_utils.py
CHANGED
@@ -12,7 +12,6 @@ from sky import sky_logging
|
|
12
12
|
from sky.skylet import constants
|
13
13
|
from sky.utils import common_utils
|
14
14
|
from sky.utils import log_utils
|
15
|
-
from sky.utils.cli_utils import status_utils
|
16
15
|
|
17
16
|
logger = sky_logging.init_logger(__name__)
|
18
17
|
|
@@ -22,6 +21,8 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
|
|
22
21
|
'to the cloud storage for {path!r}'
|
23
22
|
'due to the following error: {error_msg!r}')
|
24
23
|
|
24
|
+
_LAST_USE_TRUNC_LENGTH = 25
|
25
|
+
|
25
26
|
|
26
27
|
def format_storage_table(storages: List[Dict[str, Any]],
|
27
28
|
show_all: bool = False) -> str:
|
@@ -46,8 +47,8 @@ def format_storage_table(storages: List[Dict[str, Any]],
|
|
46
47
|
if show_all:
|
47
48
|
command = row['last_use']
|
48
49
|
else:
|
49
|
-
command =
|
50
|
-
|
50
|
+
command = common_utils.truncate_long_string(row['last_use'],
|
51
|
+
_LAST_USE_TRUNC_LENGTH)
|
51
52
|
storage_table.add_row([
|
52
53
|
# NAME
|
53
54
|
row['name'],
|
sky/jobs/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
|
|
8
8
|
from sky.jobs.core import cancel
|
9
9
|
from sky.jobs.core import launch
|
10
10
|
from sky.jobs.core import queue
|
11
|
+
from sky.jobs.core import queue_from_kubernetes_pod
|
11
12
|
from sky.jobs.core import tail_logs
|
12
13
|
from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
|
13
14
|
from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
|
@@ -34,6 +35,7 @@ __all__ = [
|
|
34
35
|
'cancel',
|
35
36
|
'launch',
|
36
37
|
'queue',
|
38
|
+
'queue_from_kubernetes_pod',
|
37
39
|
'tail_logs',
|
38
40
|
# utils
|
39
41
|
'ManagedJobCodeGen',
|
sky/jobs/core.py
CHANGED
@@ -9,6 +9,7 @@ import colorama
|
|
9
9
|
import sky
|
10
10
|
from sky import backends
|
11
11
|
from sky import exceptions
|
12
|
+
from sky import provision as provision_lib
|
12
13
|
from sky import sky_logging
|
13
14
|
from sky import status_lib
|
14
15
|
from sky import task as task_lib
|
@@ -16,6 +17,7 @@ from sky.backends import backend_utils
|
|
16
17
|
from sky.clouds.service_catalog import common as service_catalog_common
|
17
18
|
from sky.jobs import constants as managed_job_constants
|
18
19
|
from sky.jobs import utils as managed_job_utils
|
20
|
+
from sky.provision import common
|
19
21
|
from sky.skylet import constants as skylet_constants
|
20
22
|
from sky.usage import usage_lib
|
21
23
|
from sky.utils import admin_policy_utils
|
@@ -138,6 +140,82 @@ def launch(
|
|
138
140
|
_disable_controller_check=True)
|
139
141
|
|
140
142
|
|
143
|
+
def queue_from_kubernetes_pod(
        pod_name: str,
        context: Optional[str] = None,
        skip_finished: bool = False) -> List[Dict[str, Any]]:
    """Fetches the managed jobs queue from one controller pod.

    A minimal cluster description containing only the given pod is built so
    that a Kubernetes command runner can be obtained; the job-table code is
    then run through that runner and its output is parsed.

    Args:
        pod_name (str): The name of the controller pod to query for jobs.
        context (Optional[str]): The Kubernetes context to use. If None, the
            current context is used.
        skip_finished (bool): If True, does not return finished jobs.

    Returns:
        [
            {
                'job_id': int,
                'job_name': str,
                'resources': str,
                'submitted_at': (float) timestamp of submission,
                'end_at': (float) timestamp of end,
                'duration': (float) duration in seconds,
                'recovery_count': (int) Number of retries,
                'status': (sky.jobs.ManagedJobStatus) of the job,
                'cluster_resources': (str) resources of the cluster,
                'region': (str) region of the cluster,
            }
        ]

    Raises:
        RuntimeError: If there's an error fetching the managed jobs.
    """
    # Dummy cluster info, only needed to construct the command runner.
    # Internal/external IPs are left empty: not required for Kubernetes.
    head_instance = common.InstanceInfo(instance_id=pod_name,
                                        internal_ip='',
                                        external_ip='',
                                        tags={})
    cluster_info = common.ClusterInfo(provider_name='kubernetes',
                                      head_instance_id=pod_name,
                                      provider_config={'context': context},
                                      instances={pod_name: [head_instance]})
    runners = provision_lib.get_command_runners('kubernetes', cluster_info)
    controller_runner = runners[0]

    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
    returncode, job_table_payload, stderr = controller_runner.run(
        code,
        require_outputs=True,
        separate_stderr=True,
        stream_logs=False,
    )
    try:
        subprocess_utils.handle_returncode(returncode,
                                           code,
                                           'Failed to fetch managed jobs',
                                           job_table_payload + stderr,
                                           stream_logs=False)
    except exceptions.CommandError as e:
        raise RuntimeError(str(e)) from e

    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
    if not skip_finished:
        return jobs

    # Keep every task of a job that still has at least one non-terminal
    # task, so partially finished multi-task jobs are returned in full.
    unfinished_job_ids = {
        job['job_id'] for job in jobs if not job['status'].is_terminal()
    }
    return [job for job in jobs if job['job_id'] in unfinished_job_ids]
|
217
|
+
|
218
|
+
|
141
219
|
@usage_lib.entrypoint
|
142
220
|
def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
|
143
221
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
sky/jobs/utils.py
CHANGED
@@ -599,11 +599,20 @@ def format_job_table(
|
|
599
599
|
a list of "rows" (each of which is a list of str).
|
600
600
|
"""
|
601
601
|
jobs = collections.defaultdict(list)
|
602
|
+
# Check if the tasks have user information.
|
603
|
+
tasks_have_user = any([task.get('user') for task in tasks])
|
604
|
+
if max_jobs and tasks_have_user:
|
605
|
+
raise ValueError('max_jobs is not supported when tasks have user info.')
|
606
|
+
|
607
|
+
def get_hash(task):
|
608
|
+
if tasks_have_user:
|
609
|
+
return (task['user'], task['job_id'])
|
610
|
+
return task['job_id']
|
611
|
+
|
602
612
|
for task in tasks:
|
603
613
|
# The tasks within the same job_id are already sorted
|
604
614
|
# by the task_id.
|
605
|
-
jobs[task
|
606
|
-
jobs = dict(jobs)
|
615
|
+
jobs[get_hash(task)].append(task)
|
607
616
|
|
608
617
|
status_counts: Dict[str, int] = collections.defaultdict(int)
|
609
618
|
for job_tasks in jobs.values():
|
@@ -611,17 +620,14 @@ def format_job_table(
|
|
611
620
|
if not managed_job_status.is_terminal():
|
612
621
|
status_counts[managed_job_status.value] += 1
|
613
622
|
|
614
|
-
if max_jobs is not None:
|
615
|
-
job_ids = sorted(jobs.keys(), reverse=True)
|
616
|
-
job_ids = job_ids[:max_jobs]
|
617
|
-
jobs = {job_id: jobs[job_id] for job_id in job_ids}
|
618
|
-
|
619
623
|
columns = [
|
620
624
|
'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
|
621
625
|
'JOB DURATION', '#RECOVERIES', 'STATUS'
|
622
626
|
]
|
623
627
|
if show_all:
|
624
628
|
columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
|
629
|
+
if tasks_have_user:
|
630
|
+
columns.insert(0, 'USER')
|
625
631
|
job_table = log_utils.create_table(columns)
|
626
632
|
|
627
633
|
status_counts: Dict[str, int] = collections.defaultdict(int)
|
@@ -636,9 +642,9 @@ def format_job_table(
|
|
636
642
|
for task in all_tasks:
|
637
643
|
# The tasks within the same job_id are already sorted
|
638
644
|
# by the task_id.
|
639
|
-
jobs[task
|
645
|
+
jobs[get_hash(task)].append(task)
|
640
646
|
|
641
|
-
for
|
647
|
+
for job_hash, job_tasks in jobs.items():
|
642
648
|
if len(job_tasks) > 1:
|
643
649
|
# Aggregate the tasks into a new row in the table.
|
644
650
|
job_name = job_tasks[0]['job_name']
|
@@ -674,6 +680,7 @@ def format_job_table(
|
|
674
680
|
if not managed_job_status.is_terminal():
|
675
681
|
status_str += f' (task: {current_task_id})'
|
676
682
|
|
683
|
+
job_id = job_hash[1] if tasks_have_user else job_hash
|
677
684
|
job_values = [
|
678
685
|
job_id,
|
679
686
|
'',
|
@@ -692,6 +699,8 @@ def format_job_table(
|
|
692
699
|
'-',
|
693
700
|
failure_reason if failure_reason is not None else '-',
|
694
701
|
])
|
702
|
+
if tasks_have_user:
|
703
|
+
job_values.insert(0, job_tasks[0].get('user', '-'))
|
695
704
|
job_table.add_row(job_values)
|
696
705
|
|
697
706
|
for task in job_tasks:
|
@@ -724,6 +733,8 @@ def format_job_table(
|
|
724
733
|
task['failure_reason']
|
725
734
|
if task['failure_reason'] is not None else '-',
|
726
735
|
])
|
736
|
+
if tasks_have_user:
|
737
|
+
values.insert(0, task.get('user', '-'))
|
727
738
|
job_table.add_row(values)
|
728
739
|
|
729
740
|
if len(job_tasks) > 1:
|
@@ -1998,3 +1998,28 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
|
|
1998
1998
|
# we need to use in-cluster auth.
|
1999
1999
|
context = None
|
2000
2000
|
return context
|
2001
|
+
|
2002
|
+
|
2003
|
+
def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
    """Lists the pods carrying the 'skypilot-cluster' label in all namespaces.

    Args:
        context: Kubernetes context to use. If None, uses the current context.

    Returns:
        A list of Kubernetes pod objects.

    Raises:
        exceptions.ResourcesUnavailableError: If the list request fails with
            the Kubernetes max-retry error.
    """
    kube_context = (get_current_kube_config_context_name()
                    if context is None else context)

    try:
        core_api = kubernetes.core_api(kube_context)
        skypilot_pods = core_api.list_pod_for_all_namespaces(
            label_selector='skypilot-cluster',
            _request_timeout=kubernetes.API_TIMEOUT).items
    except kubernetes.max_retry_error():
        # `from None` hides the low-level retry traceback from the user.
        raise exceptions.ResourcesUnavailableError(
            'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
            'Please check if the cluster is healthy and retry. To debug, run: '
            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
        ) from None
    return skypilot_pods
|
@@ -1,12 +1,16 @@
|
|
1
1
|
"""Utilities for sky status."""
|
2
|
-
from typing import Any, Callable, Dict, List, Optional
|
2
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
3
3
|
|
4
4
|
import click
|
5
5
|
import colorama
|
6
6
|
|
7
7
|
from sky import backends
|
8
|
+
from sky import clouds as sky_clouds
|
9
|
+
from sky import resources as resources_lib
|
8
10
|
from sky import status_lib
|
11
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
9
12
|
from sky.skylet import constants
|
13
|
+
from sky.utils import common_utils
|
10
14
|
from sky.utils import log_utils
|
11
15
|
from sky.utils import resources_utils
|
12
16
|
|
@@ -19,25 +23,6 @@ _ClusterRecord = Dict[str, Any]
|
|
19
23
|
_ClusterCostReportRecord = Dict[str, Any]
|
20
24
|
|
21
25
|
|
22
|
-
def truncate_long_string(s: str, max_length: int = 35) -> str:
|
23
|
-
if len(s) <= max_length:
|
24
|
-
return s
|
25
|
-
splits = s.split(' ')
|
26
|
-
if len(splits[0]) > max_length:
|
27
|
-
return splits[0][:max_length] + '...' # Use '…'?
|
28
|
-
# Truncate on word boundary.
|
29
|
-
i = 0
|
30
|
-
total = 0
|
31
|
-
for i, part in enumerate(splits):
|
32
|
-
total += len(part)
|
33
|
-
if total >= max_length:
|
34
|
-
break
|
35
|
-
prefix = ' '.join(splits[:i])
|
36
|
-
if len(prefix) < max_length:
|
37
|
-
prefix += s[len(prefix):max_length]
|
38
|
-
return prefix + '...'
|
39
|
-
|
40
|
-
|
41
26
|
class StatusColumn:
|
42
27
|
"""One column of the displayed cluster table"""
|
43
28
|
|
@@ -54,7 +39,7 @@ class StatusColumn:
|
|
54
39
|
def calc(self, record):
|
55
40
|
val = self.calc_func(record)
|
56
41
|
if self.trunc_length != 0:
|
57
|
-
val = truncate_long_string(str(val), self.trunc_length)
|
42
|
+
val = common_utils.truncate_long_string(str(val), self.trunc_length)
|
58
43
|
return val
|
59
44
|
|
60
45
|
|
@@ -316,3 +301,165 @@ def _get_estimated_cost_for_cost_report(
|
|
316
301
|
return '-'
|
317
302
|
|
318
303
|
return f'$ {cost:.2f}'
|
304
|
+
|
305
|
+
|
306
|
+
def show_kubernetes_cluster_status_table(clusters: List[Any],
                                         show_all: bool) -> None:
    """Compute cluster table values and display for Kubernetes clusters.

    Args:
        clusters: Cluster records; each must provide the keys 'user',
            'cluster_name', 'launched_at', 'resources_str' and 'status'.
        show_all: If True, also show columns hidden by default and do not
            truncate the RESOURCES column.
    """
    status_columns = [
        StatusColumn('USER', lambda c: c['user']),
        StatusColumn('NAME', lambda c: c['cluster_name']),
        StatusColumn(
            'LAUNCHED',
            lambda c: log_utils.readable_time_duration(c['launched_at'])),
        StatusColumn('RESOURCES',
                     lambda c: c['resources_str'],
                     trunc_length=70 if not show_all else 0),
        StatusColumn('STATUS', lambda c: c['status'].colored_str()),
        # TODO(romilb): We should consider adding POD_NAME field here when --all
        # is passed to help users fetch pod name programmatically.
    ]

    # Decide once which columns are visible; reused for header and rows.
    visible_columns = [
        col for col in status_columns if col.show_by_default or show_all
    ]
    cluster_table = log_utils.create_table(
        [col.name for col in visible_columns])

    # Sort table by user, then by cluster name. The user is read from an
    # optional pod label and may be None; comparing None with str raises
    # TypeError, so a missing user is sorted as the empty string.
    sorted_clusters = sorted(clusters,
                             key=lambda c:
                             (c['user'] or '', c['cluster_name']))

    for cluster in sorted_clusters:
        cluster_table.add_row([col.calc(cluster) for col in visible_columns])

    if clusters:
        click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                   f'SkyPilot clusters'
                   f'{colorama.Style.RESET_ALL}')
        click.echo(cluster_table)
    else:
        click.echo('No SkyPilot resources found in the '
                   'active Kubernetes context.')
|
347
|
+
|
348
|
+
|
349
|
+
def process_skypilot_pods(
    pods: List[Any],
    context: Optional[str] = None
) -> Tuple[List[Dict[Any, Any]], Dict[str, Any], Dict[str, Any]]:
    """Process SkyPilot pods on k8s to extract cluster and controller info.

    Args:
        pods: List of Kubernetes pod objects.
        context: Kubernetes context name, used to detect GPU label formatter.

    Returns:
        A tuple containing:
        - List of dictionaries with cluster information.
        - Dictionary of job controller information.
        - Dictionary of serve controller information.

        Each dictionary contains the following keys:
            'cluster_name_on_cloud': The cluster_name_on_cloud used by SkyPilot
            'cluster_name': The cluster name without the user hash
            'user': The user who created the cluster. Fetched from pod label;
                None if the label is missing
            'status': The cluster status (assumed UP if pod exists)
            'pods': List of pod objects in the cluster
            'launched_at': Timestamp of when the cluster was launched, or
                None if no pod reported a start time

        Cluster dictionaries additionally contain:
            'resources': sky.Resources object for the cluster
            'resources_str': '<number of pods>x <resources>'
    """
    clusters: Dict[str, Dict] = {}
    jobs_controllers: Dict[str, Dict] = {}
    serve_controllers: Dict[str, Dict] = {}

    for pod in pods:
        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
        cluster_name = cluster_name_on_cloud.rsplit(
            '-', 1
        )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)

        # start_time may be None (e.g. the pod has not started yet), so it is
        # only converted to a timestamp when present.
        start_time = pod.status.start_time
        if start_time is not None:
            start_time = start_time.timestamp()

        # Check if cluster name is name of a controller
        # Can't use controller_utils.Controllers.from_name(cluster_name)
        # because hash is different across users
        if 'controller' in cluster_name_on_cloud:
            controller_info = {
                'cluster_name_on_cloud': cluster_name_on_cloud,
                'cluster_name': cluster_name,
                'user': pod.metadata.labels.get('skypilot-user'),
                'status': status_lib.ClusterStatus.UP,
                # Assuming UP if pod exists
                'pods': [pod],
                'launched_at': start_time
            }
            if 'sky-jobs-controller' in cluster_name_on_cloud:
                jobs_controllers[cluster_name_on_cloud] = controller_info
            elif 'sky-serve-controller' in cluster_name_on_cloud:
                serve_controllers[cluster_name_on_cloud] = controller_info

        if cluster_name_on_cloud not in clusters:
            if pod.status.phase == 'Pending':
                # If pod is pending, do not show it in the status. Checked
                # before parsing resources to skip needless work.
                continue

            # Parse resources. Both the resources object and its requests
            # mapping are optional on a container spec.
            container_resources = pod.spec.containers[0].resources
            requests = {}
            if container_resources is not None:
                requests = container_resources.requests or {}
            cpu_request = kubernetes_utils.parse_cpu_or_gpu_resource(
                requests.get('cpu', '0'))
            memory_request = kubernetes_utils.parse_memory_resource(
                requests.get('memory', '0'), unit='G')
            gpu_count = kubernetes_utils.parse_cpu_or_gpu_resource(
                requests.get('nvidia.com/gpu', '0'))

            accelerators = None
            if gpu_count > 0:
                label_formatter, _ = (
                    kubernetes_utils.detect_gpu_label_formatter(context))
                assert label_formatter is not None, (
                    'GPU label formatter cannot be None if there are pods '
                    f'requesting GPUs: {pod.metadata.name}')
                gpu_label = label_formatter.get_label_key()
                # Get GPU name from pod node selector. Reset per pod so a
                # name from an earlier pod is never reused.
                gpu_name = None
                node_selector = pod.spec.node_selector
                if node_selector is not None:
                    gpu_label_value = node_selector.get(gpu_label)
                    if gpu_label_value is not None:
                        gpu_name = (
                            label_formatter.get_accelerator_from_label_value(
                                gpu_label_value))
                # Without a selector naming the GPU type the accelerator
                # cannot be identified; it is omitted rather than guessed.
                if gpu_name is not None:
                    accelerators = f'{gpu_name}:{gpu_count}'

            resources = resources_lib.Resources(
                cloud=sky_clouds.Kubernetes(),
                cpus=int(cpu_request),
                memory=int(memory_request),
                accelerators=accelerators)

            clusters[cluster_name_on_cloud] = {
                'cluster_name_on_cloud': cluster_name_on_cloud,
                'cluster_name': cluster_name,
                'user': pod.metadata.labels.get('skypilot-user'),
                'status': status_lib.ClusterStatus.UP,
                'pods': [],
                'launched_at': start_time,
                'resources': resources,
            }
        else:
            # Update start_time if this pod started earlier. The recorded
            # value may be None if the first pod had no start time.
            launched_at = clusters[cluster_name_on_cloud]['launched_at']
            if start_time is not None and (launched_at is None or
                                           start_time < launched_at):
                clusters[cluster_name_on_cloud]['launched_at'] = start_time
        clusters[cluster_name_on_cloud]['pods'].append(pod)

    # Update resources_str in clusters:
    for cluster in clusters.values():
        num_pods = len(cluster['pods'])
        cluster['resources_str'] = f'{num_pods}x {cluster["resources"]}'
    return list(clusters.values()), jobs_controllers, serve_controllers
|
sky/utils/common_utils.py
CHANGED
@@ -679,3 +679,23 @@ def deprecated_function(
|
|
679
679
|
return func(*args, **kwargs)
|
680
680
|
|
681
681
|
return new_func
|
682
|
+
|
683
|
+
|
684
|
+
def truncate_long_string(s: str, max_length: int = 35) -> str:
    """Shorten a string to roughly max_length characters and append '...'.

    Strings no longer than max_length are returned unchanged. Otherwise the
    result is normally the first max_length characters followed by '...'.
    When the leading whole words, joined with their spaces, already reach
    max_length, those whole words are kept instead, so the text before the
    ellipsis can be longer than max_length.

    Args:
        s: The string to shorten.
        max_length: Length threshold above which the string is shortened.

    Returns:
        The original string, or a shortened prefix followed by '...'.
    """
    if len(s) <= max_length:
        return s
    words = s.split(' ')
    first_word = words[0]
    if len(first_word) > max_length:
        return first_word[:max_length] + '...'  # Use '…'?
    # Find the word at which the running character count (spaces are not
    # counted) reaches max_length; if it never does, the last word is used.
    cut = 0
    char_count = 0
    for cut, word in enumerate(words):
        char_count += len(word)
        if char_count >= max_length:
            break
    head = ' '.join(words[:cut])
    if len(head) < max_length:
        # `head` is a prefix of `s`, so extending it up to max_length
        # characters is the same as slicing `s` directly.
        head = s[:max_length]
    return head + '...'
|
{skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=CLCwcGUqllWQ-4S_tYJ0ytr3zK1fJxBKIt6jjmOTCX4,5854
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=TfKkVnmRIetATSEVQFp-rOOIRGqVig2i8faSQQt_ixA,20974
|
4
4
|
sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
|
5
|
-
sky/cli.py,sha256=
|
5
|
+
sky/cli.py,sha256=aCdE8kQIaCoNFnFiP6Kkrimo8FokdZMCYQXtx71ir7k,210460
|
6
6
|
sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
|
7
7
|
sky/core.py,sha256=YF_6kwj8Ja171Oycb8L25SZ7V_ylZYovFS_jpnjwGo0,34408
|
8
8
|
sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
|
@@ -92,14 +92,14 @@ sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,734
|
|
92
92
|
sky/data/data_utils.py,sha256=-P5GsDH_m4slrCz4vHdgiFezIys8ufzvhEKePJwfjFc,28597
|
93
93
|
sky/data/mounting_utils.py,sha256=44YkYIIgArEkyvxCtfmXXumybrU8bmn1TfLXWv_eldI,11480
|
94
94
|
sky/data/storage.py,sha256=SzO2GefxfoYbKuWO4iRt_9At33s--k4q8htN8xy-vrM,162395
|
95
|
-
sky/data/storage_utils.py,sha256=
|
96
|
-
sky/jobs/__init__.py,sha256=
|
95
|
+
sky/data/storage_utils.py,sha256=LNzowf_t_pXjxa42HE8tFxp-v5G0jg6NZlYidOgAibg,9370
|
96
|
+
sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
|
97
97
|
sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
|
98
98
|
sky/jobs/controller.py,sha256=k28bbicxtML6p1YxSetk-1nhBHPCubpvLWJsh7TtU9c,26701
|
99
|
-
sky/jobs/core.py,sha256=
|
99
|
+
sky/jobs/core.py,sha256=lRcM6ZGtYoAIygff4x-pneV3e7kyFNdA1ts_XrHb-Bg,16784
|
100
100
|
sky/jobs/recovery_strategy.py,sha256=G3iFicEajB-l9FefvcqjqPIazb1X8BJ_AgVmD5bDV2w,25556
|
101
101
|
sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
|
102
|
-
sky/jobs/utils.py,sha256=
|
102
|
+
sky/jobs/utils.py,sha256=QOQx31Pr9npiRrW10-Br2fm9YAT_25vspPtVqOHe4ao,36130
|
103
103
|
sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
|
104
104
|
sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
|
105
105
|
sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
|
@@ -141,7 +141,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
|
|
141
141
|
sky/provision/kubernetes/instance.py,sha256=MdgyGcMUbhsSRdaTRV3IgHmiAj5goCDVhzDZ2PDVs_Y,38323
|
142
142
|
sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
|
143
143
|
sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
|
144
|
-
sky/provision/kubernetes/utils.py,sha256=
|
144
|
+
sky/provision/kubernetes/utils.py,sha256=oJgCrbR8IyTw_uMoM9oxYYYNN_tZ1yzppWNTDM-XqaM,84522
|
145
145
|
sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
|
146
146
|
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
|
147
147
|
sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
|
@@ -246,7 +246,7 @@ sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO
|
|
246
246
|
sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
|
247
247
|
sky/utils/command_runner.py,sha256=n1B2h_25G_xIz7ICClczR_fkVgy0-MTHc9907Uy_Wvc,34582
|
248
248
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
249
|
-
sky/utils/common_utils.py,sha256=
|
249
|
+
sky/utils/common_utils.py,sha256=WqvsdsGRsznKOxkXKRztcX__FoiiCEAKzBU97W3BqG4,24643
|
250
250
|
sky/utils/controller_utils.py,sha256=32pVORm2cd42tg0srxGvmYV0kYTl67IFsw2EdXbdoR8,38042
|
251
251
|
sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
|
252
252
|
sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
|
@@ -261,7 +261,7 @@ sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
|
|
261
261
|
sky/utils/ux_utils.py,sha256=318TRunQCyJpJXonfiJ1SVotNA-6K4F2XgMEYjvWvsk,3264
|
262
262
|
sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
|
263
263
|
sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
264
|
-
sky/utils/cli_utils/status_utils.py,sha256=
|
264
|
+
sky/utils/cli_utils/status_utils.py,sha256=zIaiGWtEACdczubiSBMTupueZStPt1VqgRrhQilLEaI,17954
|
265
265
|
sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
266
266
|
sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
|
267
267
|
sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
|
@@ -273,9 +273,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
|
|
273
273
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
274
274
|
sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
|
275
275
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
276
|
-
skypilot_nightly-1.0.0.
|
277
|
-
skypilot_nightly-1.0.0.
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
276
|
+
skypilot_nightly-1.0.0.dev20241012.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
277
|
+
skypilot_nightly-1.0.0.dev20241012.dist-info/METADATA,sha256=fce-5TsxPsr-kg39AVm7aIaEGf-tSusSNfYarjlaXuw,18945
|
278
|
+
skypilot_nightly-1.0.0.dev20241012.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
279
|
+
skypilot_nightly-1.0.0.dev20241012.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
280
|
+
skypilot_nightly-1.0.0.dev20241012.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
281
|
+
skypilot_nightly-1.0.0.dev20241012.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|