skypilot-nightly 1.0.0.dev20241011__py3-none-any.whl → 1.0.0.dev20241012__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
 
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'd0d221fae659ccce73df5684fca53e0719dab814'
+ _SKYPILOT_COMMIT_SHA = 'fdd68b209ee74f9282fac5c6834907d5fe72d255'
 
 
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241011'
+ __version__ = '1.0.0.dev20241012'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py CHANGED
@@ -1458,6 +1458,79 @@ def _get_services(service_names: Optional[List[str]],
      return num_services, msg
 
 
+ def _status_kubernetes(show_all: bool):
+     """Show all SkyPilot resources in the current Kubernetes context.
+
+     Args:
+         show_all (bool): Show all job information (e.g., start time, failures).
+     """
+     context = kubernetes_utils.get_current_kube_config_context_name()
+     try:
+         pods = kubernetes_utils.get_skypilot_pods(context)
+     except exceptions.ResourcesUnavailableError as e:
+         with ux_utils.print_exception_no_traceback():
+             raise ValueError('Failed to get SkyPilot pods from '
+                              f'Kubernetes: {str(e)}') from e
+     all_clusters, jobs_controllers, serve_controllers = (
+         status_utils.process_skypilot_pods(pods, context))
+     all_jobs = []
+     with rich_utils.safe_status(
+             '[bold cyan]Checking in-progress managed jobs[/]') as spinner:
+         for i, (_, job_controller_info) in enumerate(jobs_controllers.items()):
+             user = job_controller_info['user']
+             pod = job_controller_info['pods'][0]
+             status_message = ('[bold cyan]Checking managed jobs controller')
+             if len(jobs_controllers) > 1:
+                 status_message += f's ({i+1}/{len(jobs_controllers)})'
+             spinner.update(f'{status_message}[/]')
+             try:
+                 job_list = managed_jobs.queue_from_kubernetes_pod(
+                     pod.metadata.name)
+             except RuntimeError as e:
+                 logger.warning('Failed to get managed jobs from controller '
+                                f'{pod.metadata.name}: {str(e)}')
+                 job_list = []
+             # Add user field to jobs
+             for job in job_list:
+                 job['user'] = user
+             all_jobs.extend(job_list)
+     # Reconcile cluster state between managed jobs and clusters:
+     # To maintain a clear separation between regular SkyPilot clusters
+     # and those from managed jobs, we need to exclude the latter from
+     # the main cluster list.
+     # We do this by reconstructing managed job cluster names from each
+     # job's name and ID. We then use this set to filter out managed
+     # clusters from the main cluster list. This is necessary because there
+     # are no identifiers distinguishing clusters from managed jobs from
+     # regular clusters.
+     managed_job_cluster_names = set()
+     for job in all_jobs:
+         # Managed job cluster name is <job_name>-<job_id>
+         managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
+         managed_job_cluster_names.add(managed_cluster_name)
+     unmanaged_clusters = [
+         c for c in all_clusters
+         if c['cluster_name'] not in managed_job_cluster_names
+     ]
+     click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                f'Kubernetes cluster state (context: {context})'
+                f'{colorama.Style.RESET_ALL}')
+     status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
+                                                       show_all)
+     if all_jobs:
+         click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                    f'Managed jobs'
+                    f'{colorama.Style.RESET_ALL}')
+         msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
+         click.echo(msg)
+     if serve_controllers:
+         # TODO: Parse serve controllers and show services separately.
+         # Currently we show a hint that services are shown as clusters.
+         click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
+                    'shown in the "SkyPilot clusters" section.'
+                    f'{colorama.Style.RESET_ALL}')
+
+
  @cli.command()
  @click.option('--all',
                '-a',
@@ -1503,6 +1576,14 @@ def _get_services(service_names: Optional[List[str]],
                is_flag=True,
                required=False,
                help='Also show sky serve services, if any.')
+ @click.option(
+     '--kubernetes',
+     '--k8s',
+     default=False,
+     is_flag=True,
+     required=False,
+     help='[Experimental] Show all SkyPilot resources (including from other '
+     'users) in the current Kubernetes context.')
  @click.argument('clusters',
                  required=False,
                  type=str,
@@ -1512,7 +1593,7 @@ def _get_services(service_names: Optional[List[str]],
  # pylint: disable=redefined-builtin
  def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
             endpoint: Optional[int], show_managed_jobs: bool,
-            show_services: bool, clusters: List[str]):
+            show_services: bool, kubernetes: bool, clusters: List[str]):
      # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
      """Show clusters.
 
@@ -1571,6 +1652,9 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
      or for autostop-enabled clusters, use ``--refresh`` to query the latest
      cluster statuses from the cloud providers.
      """
+     if kubernetes:
+         _status_kubernetes(all)
+         return
      # Using a pool with 2 worker to run the managed job query and sky serve
      # service query in parallel to speed up. The pool provides a AsyncResult
      # object that can be used as a future.
@@ -3113,7 +3197,12 @@ def show_gpus(
          print_section_titles = False
          # If cloud is kubernetes, we want to show real-time capacity
          if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
-             context = region
+             if region:
+                 context = region
+             else:
+                 # If region is not specified, we use the current context
+                 context = (
+                     kubernetes_utils.get_current_kube_config_context_name())
              try:
                  # If --cloud kubernetes is not specified, we want to catch
                  # the case where no GPUs are available on the cluster and
@@ -3128,7 +3217,7 @@ def show_gpus(
              else:
                  print_section_titles = True
                  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                        f'Kubernetes GPUs (Context: {context})'
+                        f'Kubernetes GPUs (context: {context})'
                         f'{colorama.Style.RESET_ALL}\n')
                  yield from k8s_realtime_table.get_string()
                  k8s_node_table = _get_kubernetes_node_info_table(context)
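Note: the new --kubernetes/--k8s flag above routes sky status through _status_kubernetes(). A minimal sketch of exercising it, assuming the sky CLI from this wheel is installed on PATH and the active kubeconfig points at a cluster running SkyPilot pods:

    import subprocess

    # Equivalent to running `sky status --k8s` in a shell; prints the
    # Kubernetes cluster state, managed jobs, and a SkyServe hint for the
    # current context.
    subprocess.run(['sky', 'status', '--k8s'], check=True)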
sky/data/storage_utils.py CHANGED
@@ -12,7 +12,6 @@ from sky import sky_logging
  from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import log_utils
- from sky.utils.cli_utils import status_utils
 
  logger = sky_logging.init_logger(__name__)
 
@@ -22,6 +21,8 @@ _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
      'to the cloud storage for {path!r}'
      'due to the following error: {error_msg!r}')
 
+ _LAST_USE_TRUNC_LENGTH = 25
+
 
  def format_storage_table(storages: List[Dict[str, Any]],
                           show_all: bool = False) -> str:
@@ -46,8 +47,8 @@ def format_storage_table(storages: List[Dict[str, Any]],
          if show_all:
              command = row['last_use']
          else:
-             command = status_utils.truncate_long_string(
-                 row['last_use'], status_utils.COMMAND_TRUNC_LENGTH)
+             command = common_utils.truncate_long_string(row['last_use'],
+                                                         _LAST_USE_TRUNC_LENGTH)
          storage_table.add_row([
              # NAME
              row['name'],
sky/jobs/__init__.py CHANGED
@@ -8,6 +8,7 @@ from sky.jobs.constants import JOBS_TASK_YAML_PREFIX
  from sky.jobs.core import cancel
  from sky.jobs.core import launch
  from sky.jobs.core import queue
+ from sky.jobs.core import queue_from_kubernetes_pod
  from sky.jobs.core import tail_logs
  from sky.jobs.recovery_strategy import DEFAULT_RECOVERY_STRATEGY
  from sky.jobs.recovery_strategy import RECOVERY_STRATEGIES
@@ -34,6 +35,7 @@ __all__ = [
      'cancel',
      'launch',
      'queue',
+     'queue_from_kubernetes_pod',
      'tail_logs',
      # utils
      'ManagedJobCodeGen',
sky/jobs/core.py CHANGED
@@ -9,6 +9,7 @@ import colorama
  import sky
  from sky import backends
  from sky import exceptions
+ from sky import provision as provision_lib
  from sky import sky_logging
  from sky import status_lib
  from sky import task as task_lib
@@ -16,6 +17,7 @@ from sky.backends import backend_utils
  from sky.clouds.service_catalog import common as service_catalog_common
  from sky.jobs import constants as managed_job_constants
  from sky.jobs import utils as managed_job_utils
+ from sky.provision import common
  from sky.skylet import constants as skylet_constants
  from sky.usage import usage_lib
  from sky.utils import admin_policy_utils
@@ -138,6 +140,82 @@ def launch(
                              _disable_controller_check=True)
 
 
+ def queue_from_kubernetes_pod(
+         pod_name: str,
+         context: Optional[str] = None,
+         skip_finished: bool = False) -> List[Dict[str, Any]]:
+     """Gets the jobs queue from a specific controller pod.
+
+     Args:
+         pod_name (str): The name of the controller pod to query for jobs.
+         context (Optional[str]): The Kubernetes context to use. If None, the
+             current context is used.
+         skip_finished (bool): If True, does not return finished jobs.
+
+     Returns:
+         [
+             {
+                 'job_id': int,
+                 'job_name': str,
+                 'resources': str,
+                 'submitted_at': (float) timestamp of submission,
+                 'end_at': (float) timestamp of end,
+                 'duration': (float) duration in seconds,
+                 'recovery_count': (int) Number of retries,
+                 'status': (sky.jobs.ManagedJobStatus) of the job,
+                 'cluster_resources': (str) resources of the cluster,
+                 'region': (str) region of the cluster,
+             }
+         ]
+
+     Raises:
+         RuntimeError: If there's an error fetching the managed jobs.
+     """
+     # Create dummy cluster info to get the command runner.
+     provider_config = {'context': context}
+     instances = {
+         pod_name: [
+             common.InstanceInfo(instance_id=pod_name,
+                                 internal_ip='',
+                                 external_ip='',
+                                 tags={})
+         ]
+     }  # Internal IP is not required for Kubernetes
+     cluster_info = common.ClusterInfo(provider_name='kubernetes',
+                                       head_instance_id=pod_name,
+                                       provider_config=provider_config,
+                                       instances=instances)
+     managed_jobs_runner = provision_lib.get_command_runners(
+         'kubernetes', cluster_info)[0]
+
+     code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+     returncode, job_table_payload, stderr = managed_jobs_runner.run(
+         code,
+         require_outputs=True,
+         separate_stderr=True,
+         stream_logs=False,
+     )
+     try:
+         subprocess_utils.handle_returncode(returncode,
+                                            code,
+                                            'Failed to fetch managed jobs',
+                                            job_table_payload + stderr,
+                                            stream_logs=False)
+     except exceptions.CommandError as e:
+         raise RuntimeError(str(e)) from e
+
+     jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+     if skip_finished:
+         # Filter out the finished jobs. If a multi-task job is partially
+         # finished, we will include all its tasks.
+         non_finished_tasks = list(
+             filter(lambda job: not job['status'].is_terminal(), jobs))
+         non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+         jobs = list(
+             filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+     return jobs
+
+
  @usage_lib.entrypoint
  def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
      # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
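The new queue_from_kubernetes_pod() API is also exported from sky.jobs (see sky/jobs/__init__.py above). A minimal sketch of calling it; the pod name below is a hypothetical placeholder, and the record keys follow the docstring in the diff:

    from sky import jobs as managed_jobs

    # Hypothetical controller pod name; real jobs-controller pods carry the
    # 'skypilot-cluster' label and can be discovered via get_skypilot_pods().
    job_records = managed_jobs.queue_from_kubernetes_pod(
        'sky-jobs-controller-abcd1234-head',
        context=None,        # use the current kubeconfig context
        skip_finished=True)  # drop jobs whose tasks have all finished
    for record in job_records:
        print(record['job_id'], record['job_name'], record['status'])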
sky/jobs/utils.py CHANGED
@@ -599,11 +599,20 @@ def format_job_table(
          a list of "rows" (each of which is a list of str).
      """
      jobs = collections.defaultdict(list)
+     # Check if the tasks have user information.
+     tasks_have_user = any([task.get('user') for task in tasks])
+     if max_jobs and tasks_have_user:
+         raise ValueError('max_jobs is not supported when tasks have user info.')
+
+     def get_hash(task):
+         if tasks_have_user:
+             return (task['user'], task['job_id'])
+         return task['job_id']
+
      for task in tasks:
          # The tasks within the same job_id are already sorted
          # by the task_id.
-         jobs[task['job_id']].append(task)
-     jobs = dict(jobs)
+         jobs[get_hash(task)].append(task)
 
      status_counts: Dict[str, int] = collections.defaultdict(int)
      for job_tasks in jobs.values():
@@ -611,17 +620,14 @@ def format_job_table(
          if not managed_job_status.is_terminal():
              status_counts[managed_job_status.value] += 1
 
-     if max_jobs is not None:
-         job_ids = sorted(jobs.keys(), reverse=True)
-         job_ids = job_ids[:max_jobs]
-         jobs = {job_id: jobs[job_id] for job_id in job_ids}
-
      columns = [
          'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
          'JOB DURATION', '#RECOVERIES', 'STATUS'
      ]
      if show_all:
          columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
+     if tasks_have_user:
+         columns.insert(0, 'USER')
      job_table = log_utils.create_table(columns)
 
      status_counts: Dict[str, int] = collections.defaultdict(int)
@@ -636,9 +642,9 @@ def format_job_table(
      for task in all_tasks:
          # The tasks within the same job_id are already sorted
          # by the task_id.
-         jobs[task['job_id']].append(task)
+         jobs[get_hash(task)].append(task)
 
-     for job_id, job_tasks in jobs.items():
+     for job_hash, job_tasks in jobs.items():
          if len(job_tasks) > 1:
              # Aggregate the tasks into a new row in the table.
              job_name = job_tasks[0]['job_name']
@@ -674,6 +680,7 @@ def format_job_table(
              if not managed_job_status.is_terminal():
                  status_str += f' (task: {current_task_id})'
 
+             job_id = job_hash[1] if tasks_have_user else job_hash
              job_values = [
                  job_id,
                  '',
@@ -692,6 +699,8 @@ def format_job_table(
                      '-',
                      failure_reason if failure_reason is not None else '-',
                  ])
+             if tasks_have_user:
+                 job_values.insert(0, job_tasks[0].get('user', '-'))
              job_table.add_row(job_values)
 
          for task in job_tasks:
@@ -724,6 +733,8 @@ def format_job_table(
                      task['failure_reason']
                      if task['failure_reason'] is not None else '-',
                  ])
+             if tasks_have_user:
+                 values.insert(0, task.get('user', '-'))
              job_table.add_row(values)
 
          if len(job_tasks) > 1:
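For clarity, the grouping change above keys jobs by (user, job_id) whenever any task carries a 'user' field, so identical job IDs from different users stay in separate rows. A standalone illustration of the same keying (not the actual format_job_table code):

    import collections

    tasks = [
        {'user': 'alice', 'job_id': 1, 'task_id': 0},
        {'user': 'alice', 'job_id': 1, 'task_id': 1},
        {'user': 'bob', 'job_id': 1, 'task_id': 0},
    ]
    tasks_have_user = any(task.get('user') for task in tasks)

    def get_hash(task):
        # Key by (user, job_id) so two users' job #1 do not collapse together.
        if tasks_have_user:
            return (task['user'], task['job_id'])
        return task['job_id']

    jobs = collections.defaultdict(list)
    for task in tasks:
        jobs[get_hash(task)].append(task)
    assert len(jobs) == 2  # ('alice', 1) and ('bob', 1)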
sky/provision/kubernetes/utils.py CHANGED
@@ -1998,3 +1998,28 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
          # we need to use in-cluster auth.
          context = None
      return context
+
+
+ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
+     """Gets all SkyPilot pods in the Kubernetes cluster.
+
+     Args:
+         context: Kubernetes context to use. If None, uses the current context.
+
+     Returns:
+         A list of Kubernetes pod objects.
+     """
+     if context is None:
+         context = get_current_kube_config_context_name()
+
+     try:
+         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
+             label_selector='skypilot-cluster',
+             _request_timeout=kubernetes.API_TIMEOUT).items
+     except kubernetes.max_retry_error():
+         raise exceptions.ResourcesUnavailableError(
+             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
+             'Please check if the cluster is healthy and retry. To debug, run: '
+             'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+         ) from None
+     return pods
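A short sketch of using the new helper, assuming kubeconfig access to the cluster; the returned pod objects come from the Kubernetes Python client, so the usual metadata fields should be available:

    from sky.provision.kubernetes import utils as kubernetes_utils

    # Uses the current kubeconfig context when no context is passed.
    pods = kubernetes_utils.get_skypilot_pods()
    for pod in pods:
        print(pod.metadata.namespace, pod.metadata.name,
              pod.metadata.labels.get('skypilot-cluster'))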
sky/utils/cli_utils/status_utils.py CHANGED
@@ -1,12 +1,16 @@
  """Utilities for sky status."""
- from typing import Any, Callable, Dict, List, Optional
+ from typing import Any, Callable, Dict, List, Optional, Tuple
 
  import click
  import colorama
 
  from sky import backends
+ from sky import clouds as sky_clouds
+ from sky import resources as resources_lib
  from sky import status_lib
+ from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.skylet import constants
+ from sky.utils import common_utils
  from sky.utils import log_utils
  from sky.utils import resources_utils
 
@@ -19,25 +23,6 @@ _ClusterRecord = Dict[str, Any]
  _ClusterCostReportRecord = Dict[str, Any]
 
 
- def truncate_long_string(s: str, max_length: int = 35) -> str:
-     if len(s) <= max_length:
-         return s
-     splits = s.split(' ')
-     if len(splits[0]) > max_length:
-         return splits[0][:max_length] + '...'  # Use '…'?
-     # Truncate on word boundary.
-     i = 0
-     total = 0
-     for i, part in enumerate(splits):
-         total += len(part)
-         if total >= max_length:
-             break
-     prefix = ' '.join(splits[:i])
-     if len(prefix) < max_length:
-         prefix += s[len(prefix):max_length]
-     return prefix + '...'
-
-
  class StatusColumn:
      """One column of the displayed cluster table"""
 
@@ -54,7 +39,7 @@ class StatusColumn:
      def calc(self, record):
          val = self.calc_func(record)
          if self.trunc_length != 0:
-             val = truncate_long_string(str(val), self.trunc_length)
+             val = common_utils.truncate_long_string(str(val), self.trunc_length)
          return val
 
 
@@ -316,3 +301,165 @@ def _get_estimated_cost_for_cost_report(
          return '-'
 
      return f'$ {cost:.2f}'
+
+
+ def show_kubernetes_cluster_status_table(clusters: List[Any],
+                                          show_all: bool) -> None:
+     """Compute cluster table values and display for Kubernetes clusters."""
+     status_columns = [
+         StatusColumn('USER', lambda c: c['user']),
+         StatusColumn('NAME', lambda c: c['cluster_name']),
+         StatusColumn(
+             'LAUNCHED',
+             lambda c: log_utils.readable_time_duration(c['launched_at'])),
+         StatusColumn('RESOURCES',
+                      lambda c: c['resources_str'],
+                      trunc_length=70 if not show_all else 0),
+         StatusColumn('STATUS', lambda c: c['status'].colored_str()),
+         # TODO(romilb): We should consider adding POD_NAME field here when --all
+         # is passed to help users fetch pod name programmatically.
+     ]
+
+     columns = [
+         col.name for col in status_columns if col.show_by_default or show_all
+     ]
+     cluster_table = log_utils.create_table(columns)
+
+     # Sort table by user, then by cluster name
+     sorted_clusters = sorted(clusters,
+                              key=lambda c: (c['user'], c['cluster_name']))
+
+     for cluster in sorted_clusters:
+         row = []
+         for status_column in status_columns:
+             if status_column.show_by_default or show_all:
+                 row.append(status_column.calc(cluster))
+         cluster_table.add_row(row)
+
+     if clusters:
+         click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                    f'SkyPilot clusters'
+                    f'{colorama.Style.RESET_ALL}')
+         click.echo(cluster_table)
+     else:
+         click.echo('No SkyPilot resources found in the '
+                    'active Kubernetes context.')
+
+
+ def process_skypilot_pods(
+     pods: List[Any],
+     context: Optional[str] = None
+ ) -> Tuple[List[Dict[Any, Any]], Dict[str, Any], Dict[str, Any]]:
+     """Process SkyPilot pods on k8s to extract cluster and controller info.
+
+     Args:
+         pods: List of Kubernetes pod objects.
+         context: Kubernetes context name, used to detect GPU label formatter.
+
+     Returns:
+         A tuple containing:
+         - List of dictionaries with cluster information.
+         - Dictionary of job controller information.
+         - Dictionary of serve controller information.
+
+     Each dictionary contains the following keys:
+         'cluster_name_on_cloud': The cluster_name_on_cloud used by SkyPilot
+         'cluster_name': The cluster name without the user hash
+         'user': The user who created the cluster. Fetched from pod label
+         'status': The cluster status (assumed UP if pod exists)
+         'pods': List of pod objects in the cluster
+         'launched_at': Timestamp of when the cluster was launched
+         'resources': sky.Resources object for the cluster
+     """
+     clusters: Dict[str, Dict] = {}
+     jobs_controllers: Dict[str, Dict] = {}
+     serve_controllers: Dict[str, Dict] = {}
+
+     for pod in pods:
+         cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+         cluster_name = cluster_name_on_cloud.rsplit(
+             '-', 1
+         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
+
+         # Check if cluster name is name of a controller
+         # Can't use controller_utils.Controllers.from_name(cluster_name)
+         # because hash is different across users
+         if 'controller' in cluster_name_on_cloud:
+             start_time = pod.status.start_time.timestamp()
+             controller_info = {
+                 'cluster_name_on_cloud': cluster_name_on_cloud,
+                 'cluster_name': cluster_name,
+                 'user': pod.metadata.labels.get('skypilot-user'),
+                 'status': status_lib.ClusterStatus.UP,
+                 # Assuming UP if pod exists
+                 'pods': [pod],
+                 'launched_at': start_time
+             }
+             if 'sky-jobs-controller' in cluster_name_on_cloud:
+                 jobs_controllers[cluster_name_on_cloud] = controller_info
+             elif 'sky-serve-controller' in cluster_name_on_cloud:
+                 serve_controllers[cluster_name_on_cloud] = controller_info
+
+         if cluster_name_on_cloud not in clusters:
+             # Parse the start time for the cluster
+             start_time = pod.status.start_time
+             if start_time is not None:
+                 start_time = pod.status.start_time.timestamp()
+
+             # Parse resources
+             cpu_request = kubernetes_utils.parse_cpu_or_gpu_resource(
+                 pod.spec.containers[0].resources.requests.get('cpu', '0'))
+             memory_request = kubernetes_utils.parse_memory_resource(
+                 pod.spec.containers[0].resources.requests.get('memory', '0'),
+                 unit='G')
+             gpu_count = kubernetes_utils.parse_cpu_or_gpu_resource(
+                 pod.spec.containers[0].resources.requests.get(
+                     'nvidia.com/gpu', '0'))
+             if gpu_count > 0:
+                 label_formatter, _ = (
+                     kubernetes_utils.detect_gpu_label_formatter(context))
+                 assert label_formatter is not None, (
+                     'GPU label formatter cannot be None if there are pods '
+                     f'requesting GPUs: {pod.metadata.name}')
+                 gpu_label = label_formatter.get_label_key()
+                 # Get GPU name from pod node selector
+                 if pod.spec.node_selector is not None:
+                     gpu_name = label_formatter.get_accelerator_from_label_value(
+                         pod.spec.node_selector.get(gpu_label))
+
+             resources = resources_lib.Resources(
+                 cloud=sky_clouds.Kubernetes(),
+                 cpus=int(cpu_request),
+                 memory=int(memory_request),
+                 accelerators=(f'{gpu_name}:{gpu_count}'
+                               if gpu_count > 0 else None))
+             if pod.status.phase == 'Pending':
+                 # If pod is pending, do not show it in the status
+                 continue
+
+             clusters[cluster_name_on_cloud] = {
+                 'cluster_name_on_cloud': cluster_name_on_cloud,
+                 'cluster_name': cluster_name,
+                 'user': pod.metadata.labels.get('skypilot-user'),
+                 'status': status_lib.ClusterStatus.UP,
+                 'pods': [],
+                 'launched_at': start_time,
+                 'resources': resources,
+             }
+         else:
+             # Update start_time if this pod started earlier
+             pod_start_time = pod.status.start_time
+             if pod_start_time is not None:
+                 pod_start_time = pod_start_time.timestamp()
+                 if pod_start_time < clusters[cluster_name_on_cloud][
+                         'launched_at']:
+                     clusters[cluster_name_on_cloud][
+                         'launched_at'] = pod_start_time
+         clusters[cluster_name_on_cloud]['pods'].append(pod)
+     # Update resources_str in clusters:
+     for cluster_name, cluster in clusters.items():
+         resources = cluster['resources']
+         num_pods = len(cluster['pods'])
+         resources_str = f'{num_pods}x {resources}'
+         cluster['resources_str'] = resources_str
+     return list(clusters.values()), jobs_controllers, serve_controllers
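Roughly, sky status --k8s composes these helpers as follows; a minimal sketch with the managed-jobs and SkyServe handling omitted (see _status_kubernetes in sky/cli.py above for the full flow):

    from sky.provision.kubernetes import utils as kubernetes_utils
    from sky.utils.cli_utils import status_utils

    context = kubernetes_utils.get_current_kube_config_context_name()
    pods = kubernetes_utils.get_skypilot_pods(context)
    # clusters is a list of dicts; the two controller dicts are unused here.
    clusters, _jobs_controllers, _serve_controllers = (
        status_utils.process_skypilot_pods(pods, context))
    status_utils.show_kubernetes_cluster_status_table(clusters, show_all=False)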
sky/utils/common_utils.py CHANGED
@@ -679,3 +679,23 @@ def deprecated_function(
          return func(*args, **kwargs)
 
      return new_func
+
+
+ def truncate_long_string(s: str, max_length: int = 35) -> str:
+     """Truncate a string to a maximum length, preserving whole words."""
+     if len(s) <= max_length:
+         return s
+     splits = s.split(' ')
+     if len(splits[0]) > max_length:
+         return splits[0][:max_length] + '...'  # Use '…'?
+     # Truncate on word boundary.
+     i = 0
+     total = 0
+     for i, part in enumerate(splits):
+         total += len(part)
+         if total >= max_length:
+             break
+     prefix = ' '.join(splits[:i])
+     if len(prefix) < max_length:
+         prefix += s[len(prefix):max_length]
+     return prefix + '...'
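Example of the word-boundary truncation now shared via common_utils (outputs worked out from the logic above):

    from sky.utils import common_utils

    s = 'sky launch --gpus A100:8 my_task.yaml'
    print(common_utils.truncate_long_string(s, 20))
    # -> 'sky launch --gpus A1...'
    print(common_utils.truncate_long_string(s, 50))
    # -> printed unchanged: the string is shorter than max_length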
{skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: skypilot-nightly
- Version: 1.0.0.dev20241011
+ Version: 1.0.0.dev20241012
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
{skypilot_nightly-1.0.0.dev20241011.dist-info → skypilot_nightly-1.0.0.dev20241012.dist-info}/RECORD RENAMED
@@ -1,8 +1,8 @@
- sky/__init__.py,sha256=4R-I1YE1smfwcbO4zosKE8PEWzVuCTgo37088CS77fo,5854
+ sky/__init__.py,sha256=CLCwcGUqllWQ-4S_tYJ0ytr3zK1fJxBKIt6jjmOTCX4,5854
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
  sky/authentication.py,sha256=TfKkVnmRIetATSEVQFp-rOOIRGqVig2i8faSQQt_ixA,20974
  sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
- sky/cli.py,sha256=19YLRhF3p8wOfKoN0vICUJ9itFKDKTyhG3mV8UsnR-U,206326
+ sky/cli.py,sha256=aCdE8kQIaCoNFnFiP6Kkrimo8FokdZMCYQXtx71ir7k,210460
  sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
  sky/core.py,sha256=YF_6kwj8Ja171Oycb8L25SZ7V_ylZYovFS_jpnjwGo0,34408
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
@@ -92,14 +92,14 @@ sky/data/data_transfer.py,sha256=MBmjey9_p2L3IKNKTi8um09SlZe32n4wK3CkVnlTVvo,734
  sky/data/data_utils.py,sha256=-P5GsDH_m4slrCz4vHdgiFezIys8ufzvhEKePJwfjFc,28597
  sky/data/mounting_utils.py,sha256=44YkYIIgArEkyvxCtfmXXumybrU8bmn1TfLXWv_eldI,11480
  sky/data/storage.py,sha256=SzO2GefxfoYbKuWO4iRt_9At33s--k4q8htN8xy-vrM,162395
- sky/data/storage_utils.py,sha256=Rwj_Pt2Pl0e16dhHXyxiT500CYv1k7SWE_WE2jKepl0,9358
- sky/jobs/__init__.py,sha256=9cqFutVlfjQb7t8hzG-ZlQmMlbmfMirn0KNBxIFnJYQ,1398
+ sky/data/storage_utils.py,sha256=LNzowf_t_pXjxa42HE8tFxp-v5G0jg6NZlYidOgAibg,9370
+ sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
  sky/jobs/controller.py,sha256=k28bbicxtML6p1YxSetk-1nhBHPCubpvLWJsh7TtU9c,26701
- sky/jobs/core.py,sha256=Q5ExRWnF7yAYWJxwnB9NfAGBVDNqKYBCrWsypiMLCpY,13637
+ sky/jobs/core.py,sha256=lRcM6ZGtYoAIygff4x-pneV3e7kyFNdA1ts_XrHb-Bg,16784
  sky/jobs/recovery_strategy.py,sha256=G3iFicEajB-l9FefvcqjqPIazb1X8BJ_AgVmD5bDV2w,25556
  sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
- sky/jobs/utils.py,sha256=ZB2dJxtJ4hbCRdxHmy8wrmtXIvvGGE80kk5BQTOQWkQ,35653
+ sky/jobs/utils.py,sha256=QOQx31Pr9npiRrW10-Br2fm9YAT_25vspPtVqOHe4ao,36130
  sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
  sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
@@ -141,7 +141,7 @@ sky/provision/kubernetes/config.py,sha256=WEKcFXXhe89bLGAvoMiBvTDxdxkpTIA6ezrj2v
  sky/provision/kubernetes/instance.py,sha256=MdgyGcMUbhsSRdaTRV3IgHmiAj5goCDVhzDZ2PDVs_Y,38323
  sky/provision/kubernetes/network.py,sha256=EpNjRQ131CXepqbdkoRKFu4szVrm0oKEpv1l8EgOkjU,12364
  sky/provision/kubernetes/network_utils.py,sha256=t1FS3K400fetH7cBuRgQJZl5_jEeMshsvsYmnMUcq8k,11399
- sky/provision/kubernetes/utils.py,sha256=iULhot4naFOsyzp53x4Q4qpsHXvz5-DMOIFFTR8ap9s,83609
+ sky/provision/kubernetes/utils.py,sha256=oJgCrbR8IyTw_uMoM9oxYYYNN_tZ1yzppWNTDM-XqaM,84522
  sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
  sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
  sky/provision/paperspace/__init__.py,sha256=1nbUPWio7UA5gCQkO_rfEDfgXT17u5OtuByxQx4Ez6g,598
@@ -246,7 +246,7 @@ sky/utils/admin_policy_utils.py,sha256=zFCu1OFIrZRfQNY0JFRO1502WFfdqZhwAU_QgM4fO
  sky/utils/cluster_yaml_utils.py,sha256=1wRRYqI1kI-eFs1pMW4r_FFjHJ0zamq6v2RRI-Gtx5E,849
  sky/utils/command_runner.py,sha256=n1B2h_25G_xIz7ICClczR_fkVgy0-MTHc9907Uy_Wvc,34582
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
- sky/utils/common_utils.py,sha256=O6PlZTCNhbuXOzjuV2DKw43niWE_qPfYZNGhnMtZzQg,24028
+ sky/utils/common_utils.py,sha256=WqvsdsGRsznKOxkXKRztcX__FoiiCEAKzBU97W3BqG4,24643
  sky/utils/controller_utils.py,sha256=32pVORm2cd42tg0srxGvmYV0kYTl67IFsw2EdXbdoR8,38042
  sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
  sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
@@ -261,7 +261,7 @@ sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
  sky/utils/ux_utils.py,sha256=318TRunQCyJpJXonfiJ1SVotNA-6K4F2XgMEYjvWvsk,3264
  sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
  sky/utils/cli_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sky/utils/cli_utils/status_utils.py,sha256=9odkfXiXLMD14XJsqve6sGvHpe7ThHXpC6ic9RYtOqY,11032
+ sky/utils/cli_utils/status_utils.py,sha256=zIaiGWtEACdczubiSBMTupueZStPt1VqgRrhQilLEaI,17954
  sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sky/utils/kubernetes/create_cluster.sh,sha256=VLXfazav9XCMQmeKVqhuOQzt2vM6G1jgnvvb0SHUFno,7773
  sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
@@ -273,9 +273,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
  sky/utils/kubernetes/rsync_helper.sh,sha256=aRMa_0JRHtXFOPtEg4rFAwR1t57wvvAoGZhn3H3BtGk,1059
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
- skypilot_nightly-1.0.0.dev20241011.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
- skypilot_nightly-1.0.0.dev20241011.dist-info/METADATA,sha256=poy78UzKBy2gq6bEdz2LA7LJym4BgdYIdeMv9zk_Jaw,18945
- skypilot_nightly-1.0.0.dev20241011.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- skypilot_nightly-1.0.0.dev20241011.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
- skypilot_nightly-1.0.0.dev20241011.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
- skypilot_nightly-1.0.0.dev20241011.dist-info/RECORD,,
+ skypilot_nightly-1.0.0.dev20241012.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+ skypilot_nightly-1.0.0.dev20241012.dist-info/METADATA,sha256=fce-5TsxPsr-kg39AVm7aIaEGf-tSusSNfYarjlaXuw,18945
+ skypilot_nightly-1.0.0.dev20241012.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ skypilot_nightly-1.0.0.dev20241012.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+ skypilot_nightly-1.0.0.dev20241012.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+ skypilot_nightly-1.0.0.dev20241012.dist-info/RECORD,,