skypilot_nightly-1.0.0.dev20250311-py3-none-any.whl → skypilot_nightly-1.0.0.dev20250313-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/gcp.py +7 -0
  3. sky/adaptors/nebius.py +11 -1
  4. sky/backends/backend_utils.py +38 -15
  5. sky/backends/cloud_vm_ray_backend.py +17 -52
  6. sky/cli.py +26 -13
  7. sky/client/cli.py +26 -13
  8. sky/client/sdk.py +2 -9
  9. sky/clouds/gcp.py +4 -1
  10. sky/clouds/nebius.py +8 -6
  11. sky/data/storage.py +16 -0
  12. sky/exceptions.py +11 -3
  13. sky/provision/kubernetes/utils.py +10 -1
  14. sky/server/common.py +16 -0
  15. sky/server/requests/event_loop.py +31 -0
  16. sky/server/requests/executor.py +50 -22
  17. sky/server/requests/preconditions.py +174 -0
  18. sky/server/requests/requests.py +43 -4
  19. sky/server/server.py +29 -8
  20. sky/server/stream_utils.py +9 -6
  21. sky/server/uvicorn.py +81 -0
  22. sky/setup_files/dependencies.py +4 -1
  23. sky/utils/accelerator_registry.py +1 -1
  24. sky/utils/controller_utils.py +10 -0
  25. sky/utils/subprocess_utils.py +56 -1
  26. {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/METADATA +3 -3
  27. {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/RECORD +31 -28
  28. {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '439de1a24a6f0a9601051ecdc3e565308bac442a'
+_SKYPILOT_COMMIT_SHA = '6044bbfe8712221e8d0da08ce8ce7ec36ab66caf'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250311'
+__version__ = '1.0.0.dev20250313'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/gcp.py CHANGED
@@ -68,6 +68,13 @@ def credential_error_exception():
     return exceptions.DefaultCredentialsError
 
 
+@common.load_lazy_modules(_LAZY_MODULES)
+def gcp_auth_refresh_error_exception():
+    """GCP auth refresh error exception."""
+    from google.auth import exceptions
+    return exceptions.RefreshError
+
+
 @common.load_lazy_modules(_LAZY_MODULES)
 def get_credentials(cred_type: str, credentials_field: str):
     """Get GCP credentials."""
sky/adaptors/nebius.py CHANGED
@@ -6,9 +6,11 @@ from sky.adaptors import common
 NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
 NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
 NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
+NEBIUS_CREDENTIALS_FILENAME = 'credentials.json'
 NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
 NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
 NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
+NEBIUS_CREDENTIALS_PATH = '~/.nebius/' + NEBIUS_CREDENTIALS_FILENAME
 
 MAX_RETRIES_TO_DISK_CREATE = 120
 MAX_RETRIES_TO_INSTANCE_STOP = 120
@@ -72,6 +74,11 @@ def get_iam_token():
     return _iam_token
 
 
+def is_token_or_cred_file_exist():
+    return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
+            os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))
+
+
 def get_project_id():
     global _project_id
     if _project_id is None:
@@ -97,4 +104,7 @@ def get_tenant_id():
 
 
 def sdk():
-    return nebius.sdk.SDK(credentials=get_iam_token())
+    if get_iam_token() is not None:
+        return nebius.sdk.SDK(credentials=get_iam_token())
+    return nebius.sdk.SDK(
+        credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
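
The sdk() change above amounts to a small credential fallback chain: a static IAM token wins when present, otherwise the SDK is built from the service-account credentials file. A minimal sketch of the same resolution order, independent of the Nebius SDK (the helper name is illustrative, not part of the package):

    import os

    # Paths mirror NEBIUS_IAM_TOKEN_PATH / NEBIUS_CREDENTIALS_PATH above.
    IAM_TOKEN_PATH = os.path.expanduser('~/.nebius/NEBIUS_IAM_TOKEN.txt')
    CREDENTIALS_PATH = os.path.expanduser('~/.nebius/credentials.json')

    def resolve_credentials():
        """Prefer a static IAM token; fall back to the credentials file."""
        if os.path.exists(IAM_TOKEN_PATH):
            with open(IAM_TOKEN_PATH, encoding='utf-8') as f:
                return ('iam_token', f.read().strip())
        if os.path.exists(CREDENTIALS_PATH):
            return ('credentials_file', CREDENTIALS_PATH)
        return (None, None)  # caller reports setup instructions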
sky/backends/backend_utils.py CHANGED
@@ -1802,6 +1802,21 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         status == status_lib.ClusterStatus.UP for status in node_statuses) and
                      len(node_statuses) == handle.launched_nodes)
 
+    def get_node_counts_from_ray_status(
+            runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
+        rc, output, stderr = runner.run(
+            instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        if rc:
+            raise RuntimeError(
+                f'Refreshing status ({cluster_name!r}): Failed to check '
+                f'ray cluster\'s healthiness with '
+                f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
+                f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
+        return (*_count_healthy_nodes_from_ray(output), output, stderr)
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
         try:
             # NOTE: fetching the IPs is very slow as it calls into
@@ -1822,26 +1837,34 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 raise exceptions.FetchClusterInfoError(
                     reason=exceptions.FetchClusterInfoError.Reason.HEAD)
             head_runner = runners[0]
-            rc, output, stderr = head_runner.run(
-                instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            if rc:
-                raise RuntimeError(
-                    f'Refreshing status ({cluster_name!r}): Failed to check '
-                    f'ray cluster\'s healthiness with '
-                    f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
-                    f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
 
-            ready_head, ready_workers = _count_healthy_nodes_from_ray(output)
             total_nodes = handle.launched_nodes * handle.num_ips_per_node
-            if ready_head + ready_workers == total_nodes:
-                return True
+
+            for i in range(5):
+                ready_head, ready_workers, output, stderr = (
+                    get_node_counts_from_ray_status(head_runner))
+                if ready_head + ready_workers == total_nodes:
+                    return True
+                logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
+                             f'{i}: ray status not showing all nodes '
+                             f'({ready_head + ready_workers}/{total_nodes});\n'
+                             f'output:\n{output}\nstderr:\n{stderr}')
+
+                # If cluster JUST started, maybe not all the nodes have shown
+                # up. Try again for a few seconds.
+                # Note: We are okay with this performance hit because it's very
+                # rare to normally hit this case. It requires:
+                # - All the instances in the cluster are up on the cloud side
+                #   (not preempted), but
+                # - The ray cluster is somehow degraded so not all instances are
+                #   showing up
+                time.sleep(1)
+
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
-                f'{total_nodes}); output: {output}; stderr: {stderr}')
+                f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
+
         except exceptions.FetchClusterInfoError:
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
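
The backend_utils.py change replaces a single `ray status` probe with a bounded retry loop: up to 5 attempts, 1 second apart, to tolerate a freshly started cluster whose nodes have not all registered yet. The pattern in isolation, as a hedged sketch (names are illustrative, not SkyPilot APIs):

    import time
    from typing import Callable, Tuple

    def poll_until_all_nodes_ready(probe: Callable[[], Tuple[int, int]],
                                   expected_total: int,
                                   attempts: int = 5,
                                   delay_seconds: float = 1.0) -> bool:
        """Re-run `probe` until head + worker counts match the expected total.

        A freshly started cluster may not report all nodes immediately, so a
        few stale reads are tolerated before giving up.
        """
        for _ in range(attempts):
            ready_head, ready_workers = probe()
            if ready_head + ready_workers == expected_total:
                return True
            time.sleep(delay_seconds)
        return False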
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -772,32 +772,6 @@ class FailoverCloudErrorHandlerV1:
                 setattr(e, 'detailed_reason', detailed_reason)
             raise e
 
-    @staticmethod
-    def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
-                        launchable_resources: 'resources_lib.Resources',
-                        region: 'clouds.Region',
-                        zones: Optional[List['clouds.Zone']], stdout: str,
-                        stderr: str):
-        del region, zones  # Unused.
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout,
-            stderr,
-            is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
-        messages = '\n  '.join(errors)
-        style = colorama.Style
-        logger.warning(f'  {style.DIM}{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
-        # Sometimes, LambdaCloudError will list available regions.
-        for e in errors:
-            if e.find('Regions with capacity available:') != -1:
-                for r in service_catalog.regions('lambda'):
-                    if e.find(r.name) == -1:
-                        _add_to_blocked_resources(
-                            blocked_resources,
-                            launchable_resources.copy(region=r.name, zone=None))
-
     @staticmethod
     def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -846,32 +820,6 @@ class FailoverCloudErrorHandlerV1:
             _add_to_blocked_resources(blocked_resources,
                                       launchable_resources.copy(zone=zone.name))
 
-    # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
-    @staticmethod
-    def _oci_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-                     region: 'clouds.Region',
-                     zones: Optional[List['clouds.Zone']], stdout: str,
-                     stderr: str):
-        known_service_errors = [
-            'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
-            'LimitExceeded', 'NotAuthenticated'
-        ]
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
-            ('oci.exceptions.ServiceError' in x.strip() and any(
-                known_err in x.strip() for known_err in known_service_errors)))
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        style = colorama.Style
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-
-        if zones is not None:
-            for zone in zones:
-                _add_to_blocked_resources(
-                    blocked_resources,
-                    launchable_resources.copy(zone=zone.name))
-
     @staticmethod
     def update_blocklist_on_error(
         blocked_resources: Set['resources_lib.Resources'],
@@ -1123,6 +1071,23 @@ class FailoverCloudErrorHandlerV2:
                     blocked_resources,
                     launchable_resources.copy(zone=zone.name))
 
+    @staticmethod
+    def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
+                        launchable_resources: 'resources_lib.Resources',
+                        region: 'clouds.Region',
+                        zones: Optional[List['clouds.Zone']], error: Exception):
+        output = str(error)
+        # Sometimes, lambda cloud error will list available regions.
+        if output.find('Regions with capacity available:') != -1:
+            for r in service_catalog.regions('lambda'):
+                if output.find(r.name) == -1:
+                    _add_to_blocked_resources(
+                        blocked_resources,
+                        launchable_resources.copy(region=r.name, zone=None))
+        else:
+            FailoverCloudErrorHandlerV2._default_handler(
+                blocked_resources, launchable_resources, region, zones, error)
+
     @staticmethod
     def _default_handler(blocked_resources: Set['resources_lib.Resources'],
                          launchable_resources: 'resources_lib.Resources',
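
In cloud_vm_ray_backend.py, the Lambda handler moves from the V1 interface (scraping stdout/stderr) to the V2 interface (receiving the provisioning exception directly); its region-blocking heuristic is unchanged. That heuristic, as a standalone sketch (function and parameter names are illustrative):

    from typing import List

    def regions_to_block(error_message: str, all_regions: List[str]) -> List[str]:
        """Block every region the provider does NOT list as having capacity.

        When the error contains 'Regions with capacity available: ...', any
        region absent from that message is assumed to be out of capacity.
        """
        if 'Regions with capacity available:' not in error_message:
            return []
        return [r for r in all_regions if r not in error_message]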
sky/cli.py CHANGED
@@ -100,6 +100,7 @@ an autogenerated name."""
 # The maximum number of in-progress managed jobs to show in the status
 # command.
 _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
+_NUM_MANAGED_JOBS_TO_SHOW = 50
 
 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
     '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -1389,16 +1390,16 @@ def _handle_jobs_queue_request(
         request_id: str,
         show_all: bool,
         show_user: bool,
-        limit_num_jobs_to_show: bool = False,
+        max_num_jobs_to_show: Optional[int],
         is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.
 
     Args:
         show_all: Show all information of each job (e.g., region, price).
         show_user: Show the user who submitted the job.
-        limit_num_jobs_to_show: If True, limit the number of jobs to show to
-            _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
-            `sky status`.
+        max_num_jobs_to_show: If not None, limit the number of jobs to show to
+            this number, which is mainly used by `sky status`
+            and `sky jobs queue`.
         is_called_by_user: If this function is called by user directly, or an
             internal call.
 
@@ -1459,12 +1460,10 @@ def _handle_jobs_queue_request(
             msg += ('Failed to query managed jobs: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
-                            if limit_num_jobs_to_show else None)
         msg = managed_jobs.format_job_table(managed_jobs_,
                                             show_all=show_all,
                                             show_user=show_user,
-                                            max_jobs=max_jobs_to_show)
+                                            max_jobs=max_num_jobs_to_show)
     return num_in_progress_jobs, msg
 
 
@@ -1875,7 +1874,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             managed_jobs_queue_request_id,
             show_all=False,
             show_user=all_users,
-            limit_num_jobs_to_show=not all,
+            max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
             is_called_by_user=False)
     except KeyboardInterrupt:
         sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
@@ -3943,10 +3942,15 @@ def jobs_launch(
               is_flag=True,
               required=False,
              help='Show jobs from all users.')
+@click.option('--all',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Show all jobs.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool):
+               all_users: bool, all: bool):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4004,10 +4008,13 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
         managed_jobs_request_id = managed_jobs.queue(
             refresh=refresh, skip_finished=skip_finished, all_users=all_users)
-        _, msg = _handle_jobs_queue_request(managed_jobs_request_id,
-                                            show_all=verbose,
-                                            show_user=all_users,
-                                            is_called_by_user=True)
+        max_num_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW if not all else None)
+        num_jobs, msg = _handle_jobs_queue_request(
+            managed_jobs_request_id,
+            show_all=verbose,
+            show_user=all_users,
+            max_num_jobs_to_show=max_num_jobs_to_show,
+            is_called_by_user=True)
     if not skip_finished:
         in_progress_only_hint = ''
     else:
@@ -4015,6 +4022,12 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
     click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'Managed jobs{colorama.Style.RESET_ALL}'
                f'{in_progress_only_hint}\n{msg}')
+    if max_num_jobs_to_show and num_jobs and max_num_jobs_to_show < num_jobs:
+        click.echo(
+            f'{colorama.Fore.CYAN}'
+            f'Only showing the latest {max_num_jobs_to_show} '
+            f'managed jobs'
+            f'(use --all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
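
The CLI change caps `sky jobs queue` output at 50 jobs unless --all is passed, and prints a hint when the queue exceeds the cap. The flag wiring reduces to the following sketch (constants mirror the diff; helper names are illustrative):

    from typing import Optional

    _NUM_MANAGED_JOBS_TO_SHOW = 50  # default cap for `sky jobs queue`

    def jobs_display_cap(show_all: bool) -> Optional[int]:
        """None disables the limit, mirroring the --all flag above."""
        return None if show_all else _NUM_MANAGED_JOBS_TO_SHOW

    def should_print_truncation_hint(cap: Optional[int], num_jobs: int) -> bool:
        return cap is not None and num_jobs > cap

    assert jobs_display_cap(False) == 50
    assert jobs_display_cap(True) is None
    assert should_print_truncation_hint(50, 120)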
sky/client/cli.py CHANGED
@@ -100,6 +100,7 @@ an autogenerated name."""
 # The maximum number of in-progress managed jobs to show in the status
 # command.
 _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
+_NUM_MANAGED_JOBS_TO_SHOW = 50
 
 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
     '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -1389,16 +1390,16 @@ def _handle_jobs_queue_request(
         request_id: str,
         show_all: bool,
         show_user: bool,
-        limit_num_jobs_to_show: bool = False,
+        max_num_jobs_to_show: Optional[int],
         is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.
 
     Args:
         show_all: Show all information of each job (e.g., region, price).
         show_user: Show the user who submitted the job.
-        limit_num_jobs_to_show: If True, limit the number of jobs to show to
-            _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
-            `sky status`.
+        max_num_jobs_to_show: If not None, limit the number of jobs to show to
+            this number, which is mainly used by `sky status`
+            and `sky jobs queue`.
         is_called_by_user: If this function is called by user directly, or an
             internal call.
 
@@ -1459,12 +1460,10 @@ def _handle_jobs_queue_request(
             msg += ('Failed to query managed jobs: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
-                            if limit_num_jobs_to_show else None)
         msg = managed_jobs.format_job_table(managed_jobs_,
                                             show_all=show_all,
                                             show_user=show_user,
-                                            max_jobs=max_jobs_to_show)
+                                            max_jobs=max_num_jobs_to_show)
     return num_in_progress_jobs, msg
 
 
@@ -1875,7 +1874,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             managed_jobs_queue_request_id,
             show_all=False,
             show_user=all_users,
-            limit_num_jobs_to_show=not all,
+            max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
             is_called_by_user=False)
     except KeyboardInterrupt:
         sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
@@ -3943,10 +3942,15 @@ def jobs_launch(
              is_flag=True,
              required=False,
              help='Show jobs from all users.')
+@click.option('--all',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Show all jobs.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool):
+               all_users: bool, all: bool):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4004,10 +4008,13 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
         managed_jobs_request_id = managed_jobs.queue(
             refresh=refresh, skip_finished=skip_finished, all_users=all_users)
-        _, msg = _handle_jobs_queue_request(managed_jobs_request_id,
-                                            show_all=verbose,
-                                            show_user=all_users,
-                                            is_called_by_user=True)
+        max_num_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW if not all else None)
+        num_jobs, msg = _handle_jobs_queue_request(
+            managed_jobs_request_id,
+            show_all=verbose,
+            show_user=all_users,
+            max_num_jobs_to_show=max_num_jobs_to_show,
+            is_called_by_user=True)
     if not skip_finished:
         in_progress_only_hint = ''
     else:
@@ -4015,6 +4022,12 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
     click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'Managed jobs{colorama.Style.RESET_ALL}'
                f'{in_progress_only_hint}\n{msg}')
+    if max_num_jobs_to_show and num_jobs and max_num_jobs_to_show < num_jobs:
+        click.echo(
+            f'{colorama.Fore.CYAN}'
+            f'Only showing the latest {max_num_jobs_to_show} '
+            f'managed jobs'
+            f'(use --all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
sky/client/sdk.py CHANGED
@@ -32,7 +32,6 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.client import common as client_common
 from sky.server import common as server_common
-from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -1707,14 +1706,8 @@ def api_stop() -> None:
                 force=True)
             found = True
 
-    # Remove the database for requests including any files starting with
-    # api.constants.API_SERVER_REQUEST_DB_PATH
-    db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
-    for extension in ['', '-shm', '-wal']:
-        try:
-            os.remove(f'{db_path}{extension}')
-        except FileNotFoundError:
-            logger.debug(f'Database file {db_path}{extension} not found.')
+    # Remove the database for requests.
+    server_common.clear_local_api_server_database()
 
     if found:
         logger.info(f'{colorama.Fore.GREEN}SkyPilot API server stopped.'
sky/clouds/gcp.py CHANGED
@@ -843,7 +843,10 @@ class GCP(clouds.Cloud):
         permissions = {'permissions': gcp_minimal_permissions}
         request = crm.projects().testIamPermissions(resource=project,
                                                     body=permissions)
-        ret_permissions = request.execute().get('permissions', [])
+        try:
+            ret_permissions = request.execute().get('permissions', [])
+        except gcp.gcp_auth_refresh_error_exception() as e:
+            return False, common_utils.format_exception(e, use_bracket=True)
 
         diffs = set(gcp_minimal_permissions).difference(set(ret_permissions))
         if diffs:
sky/clouds/nebius.py CHANGED
@@ -17,6 +17,7 @@ _CREDENTIAL_FILES = [
     nebius.NEBIUS_TENANT_ID_FILENAME,
     nebius.NEBIUS_IAM_TOKEN_FILENAME,
     nebius.NEBIUS_PROJECT_ID_FILENAME,
+    nebius.NEBIUS_CREDENTIALS_FILENAME
 ]
 
 
@@ -252,15 +253,16 @@ class Nebius(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         """ Verify that the user has valid credentials for Nebius. """
         logging.debug('Nebius cloud check credentials')
-        token = nebius.get_iam_token()
-        token_msg = (' Credentials can be set up by running: \n'\
-                     f'    $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n')  # pylint: disable=line-too-long
+        token_cred_msg = (' Credentials can be set up by running: \n'\
+                    f'    $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'\
+                    ' or generate ~/.nebius/credentials.json')  # pylint: disable=line-too-long
+
         tenant_msg = (' Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
                       f'    $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
                       ' Or if you have 1 tenant you can run:\n'  # pylint: disable=line-too-long
                       f'    $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n')  # pylint: disable=line-too-long
-        if token is None:
-            return False, f'{token_msg}'
+        if not nebius.is_token_or_cred_file_exist():
+            return False, f'{token_cred_msg}'
         sdk = nebius.sdk()
         tenant_id = nebius.get_tenant_id()
         if tenant_id is None:
@@ -272,7 +274,7 @@ class Nebius(clouds.Cloud):
         except nebius.request_error() as e:
             return False, (
                 f'{e.status} \n'  # First line is indented by 4 spaces
-                f'{token_msg}'
+                f'{token_cred_msg}'
                 f'{tenant_msg}')
         return True, None
 
sky/data/storage.py CHANGED
@@ -143,6 +143,22 @@ class StoreType(enum.Enum):
 
         raise ValueError(f'Unsupported cloud for StoreType: {cloud}')
 
+    def to_cloud(self) -> str:
+        if self == StoreType.S3:
+            return str(clouds.AWS())
+        elif self == StoreType.GCS:
+            return str(clouds.GCP())
+        elif self == StoreType.AZURE:
+            return str(clouds.Azure())
+        elif self == StoreType.R2:
+            return cloudflare.NAME
+        elif self == StoreType.IBM:
+            return str(clouds.IBM())
+        elif self == StoreType.OCI:
+            return str(clouds.OCI())
+        else:
+            raise ValueError(f'Unknown store type: {self}')
+
     @classmethod
     def from_store(cls, store: 'AbstractStore') -> 'StoreType':
         if isinstance(store, S3Store):
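
The new StoreType.to_cloud() maps each store type to its backing cloud's display name. A hedged usage sketch (the exact strings depend on each cloud class's str(), e.g. 'AWS' or 'GCP'):

    from sky.data.storage import StoreType

    # Print the backing cloud for every store type.
    for store_type in StoreType:
        try:
            print(f'{store_type.name} -> {store_type.to_cloud()}')
        except ValueError:
            print(f'{store_type.name} -> (no cloud mapping)')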
sky/exceptions.py CHANGED
@@ -28,12 +28,19 @@ GIT_FATAL_EXIT_CODE = 128
 ARCH_NOT_SUPPORTED_EXIT_CODE = 133
 
 
-def is_safe_exception(exc: Exception) -> bool:
+def is_safe_exception(exc: BaseException) -> bool:
     """Returns True if the exception is safe to send to clients.
 
     Safe exceptions are:
     1. Built-in exceptions
     2. SkyPilot's own exceptions
+
+    Args:
+        exc: The exception to check, accept BaseException to handle SystemExit
+            and KeyboardInterrupt.
+
+    Returns:
+        True if the exception is safe to send to clients, False otherwise.
     """
     module = type(exc).__module__
 
@@ -48,7 +55,7 @@ def is_safe_exception(exc: BaseException) -> bool:
     return False
 
 
-def wrap_exception(exc: Exception) -> Exception:
+def wrap_exception(exc: BaseException) -> BaseException:
     """Wraps non-safe exceptions into SkyPilot exceptions
 
     This is used to wrap exceptions that are not safe to deserialize at clients.
@@ -64,7 +71,8 @@ def wrap_exception(exc: BaseException) -> BaseException:
                                  error_type=type(exc).__name__)
 
 
-def serialize_exception(e: Exception) -> Dict[str, Any]:
+# Accept BaseException to handle SystemExit and KeyboardInterrupt
+def serialize_exception(e: BaseException) -> Dict[str, Any]:
     """Serialize the exception.
 
     This function also wraps any unsafe exceptions (e.g., cloud exceptions)
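
The exceptions.py signatures widen from Exception to BaseException because SystemExit and KeyboardInterrupt subclass BaseException directly, not Exception, so an Exception-typed serializer could never legally receive them. A minimal illustration of the distinction:

    # KeyboardInterrupt and SystemExit sit outside the Exception hierarchy.
    assert not issubclass(KeyboardInterrupt, Exception)
    assert not issubclass(SystemExit, Exception)
    assert issubclass(KeyboardInterrupt, BaseException)

    def name_of(exc: BaseException) -> str:
        """Accepts any raisable object, including KeyboardInterrupt."""
        return type(exc).__name__

    assert name_of(KeyboardInterrupt()) == 'KeyboardInterrupt'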
sky/provision/kubernetes/utils.py CHANGED
@@ -717,6 +717,15 @@ def check_instance_fits(context: Optional[str],
             fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
             if reason is not None:
                 return fits, reason
+        else:
+            # Check if any of the GPU nodes have sufficient number of GPUs.
+            gpu_nodes = [
+                node for node in gpu_nodes if
+                get_node_accelerator_count(node.status.allocatable) >= acc_count
+            ]
+            if not gpu_nodes:
+                return False, (
+                    f'No GPU nodes found with {acc_count} or more GPUs.')
 
         candidate_nodes = gpu_nodes
         not_fit_reason_prefix = (
@@ -853,7 +862,7 @@ def get_accelerator_label_key_value(
             for label, value in label_list:
                 if (label_formatter.match_label_key(label) and
                         label_formatter.get_accelerator_from_label_value(
-                            value) == acc_type):
+                            value).lower() == acc_type.lower()):
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
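
The kubernetes/utils.py change filters candidate GPU nodes by allocatable accelerator count before scheduling, and compares accelerator names case-insensitively (node labels and user requests can differ in casing, e.g. 'H100' vs 'h100'). The filtering step in isolation, as a sketch (plain dicts stand in for Kubernetes node objects):

    from typing import Dict, List

    def nodes_that_fit(nodes: List[Dict], acc_count: int) -> List[Dict]:
        """Keep only nodes whose allocatable GPU count covers the request."""
        return [n for n in nodes if n['allocatable_gpus'] >= acc_count]

    nodes = [{'name': 'small', 'allocatable_gpus': 1},
             {'name': 'big', 'allocatable_gpus': 8}]
    assert [n['name'] for n in nodes_that_fit(nodes, 4)] == ['big']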
sky/server/common.py CHANGED
@@ -428,3 +428,19 @@ def reload_for_new_request(client_entrypoint: Optional[str],
     # necessary because the logger is initialized before the environment
     # variables are set, such as SKYPILOT_DEBUG.
     sky_logging.reload_logger()
+
+
+def clear_local_api_server_database() -> None:
+    """Removes the local API server database.
+
+    The CLI can call this during cleanup of a local API server, or the API
+    server can call it during startup.
+    """
+    # Remove the database for requests including any files starting with
+    # api.constants.API_SERVER_REQUEST_DB_PATH
+    db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
+    for extension in ['', '-shm', '-wal']:
+        try:
+            os.remove(f'{db_path}{extension}')
+        except FileNotFoundError:
+            logger.debug(f'Database file {db_path}{extension} not found.')
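
The '-shm' and '-wal' suffixes in clear_local_api_server_database() cover SQLite's shared-memory and write-ahead-log sidecar files, which live next to the main database when WAL mode is enabled. The same cleanup for an arbitrary SQLite database, as a sketch:

    import os

    def remove_sqlite_db(db_path: str) -> None:
        """Delete a SQLite database plus its WAL/SHM sidecars, if present."""
        for suffix in ('', '-wal', '-shm'):
            try:
                os.remove(db_path + suffix)
            except FileNotFoundError:
                pass  # already gone; nothing to clean up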
sky/server/requests/event_loop.py ADDED
@@ -0,0 +1,31 @@
+"""Executor event loop to process tasks in coroutines."""
+import asyncio
+import concurrent.futures
+import threading
+from typing import Coroutine, Optional
+
+# Dedicated event loop for requests, isolated with the event loop managed
+# by uvicorn. This is responsible for light-weight async tasks or sub-tasks,
+# refer to `executor.py` for more details about cooperation between the event
+# loop and executor process pool.
+_EVENT_LOOP: Optional[asyncio.AbstractEventLoop] = None
+_LOCK = threading.Lock()
+
+
+def run(coro: Coroutine) -> concurrent.futures.Future:
+    """Run a coroutine asynchronously in the request event loop."""
+    return asyncio.run_coroutine_threadsafe(coro, get_event_loop())
+
+
+def get_event_loop() -> asyncio.AbstractEventLoop:
+    """Open and get the event loop."""
+    global _EVENT_LOOP
+    if _EVENT_LOOP is not None and not _EVENT_LOOP.is_closed():
+        return _EVENT_LOOP
+    with _LOCK:
+        if _EVENT_LOOP is None or _EVENT_LOOP.is_closed():
+            _EVENT_LOOP = asyncio.new_event_loop()
+            loop_thread = threading.Thread(target=_EVENT_LOOP.run_forever,
+                                           daemon=True)
+            loop_thread.start()
+    return _EVENT_LOOP
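
run() in the new module schedules a coroutine on the dedicated loop thread via asyncio.run_coroutine_threadsafe and returns a concurrent.futures.Future, so synchronous executor code can block on .result(). A hedged usage sketch:

    import asyncio

    from sky.server.requests import event_loop

    async def probe() -> str:
        await asyncio.sleep(0.1)  # stand-in for a light-weight async sub-task
        return 'done'

    # The returned concurrent.futures.Future is usable from sync code.
    future = event_loop.run(probe())
    print(future.result(timeout=5))  # 'done'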