skypilot-nightly 1.0.0.dev20250311__py3-none-any.whl → 1.0.0.dev20250313__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/gcp.py +7 -0
- sky/adaptors/nebius.py +11 -1
- sky/backends/backend_utils.py +38 -15
- sky/backends/cloud_vm_ray_backend.py +17 -52
- sky/cli.py +26 -13
- sky/client/cli.py +26 -13
- sky/client/sdk.py +2 -9
- sky/clouds/gcp.py +4 -1
- sky/clouds/nebius.py +8 -6
- sky/data/storage.py +16 -0
- sky/exceptions.py +11 -3
- sky/provision/kubernetes/utils.py +10 -1
- sky/server/common.py +16 -0
- sky/server/requests/event_loop.py +31 -0
- sky/server/requests/executor.py +50 -22
- sky/server/requests/preconditions.py +174 -0
- sky/server/requests/requests.py +43 -4
- sky/server/server.py +29 -8
- sky/server/stream_utils.py +9 -6
- sky/server/uvicorn.py +81 -0
- sky/setup_files/dependencies.py +4 -1
- sky/utils/accelerator_registry.py +1 -1
- sky/utils/controller_utils.py +10 -0
- sky/utils/subprocess_utils.py +56 -1
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/METADATA +3 -3
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/RECORD +31 -28
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250311.dist-info → skypilot_nightly-1.0.0.dev20250313.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '6044bbfe8712221e8d0da08ce8ce7ec36ab66caf'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250313'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/adaptors/gcp.py
CHANGED
```diff
@@ -68,6 +68,13 @@ def credential_error_exception():
     return exceptions.DefaultCredentialsError
 
 
+@common.load_lazy_modules(_LAZY_MODULES)
+def gcp_auth_refresh_error_exception():
+    """GCP auth refresh error exception."""
+    from google.auth import exceptions
+    return exceptions.RefreshError
+
+
 @common.load_lazy_modules(_LAZY_MODULES)
 def get_credentials(cred_type: str, credentials_field: str):
     """Get GCP credentials."""
```
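The accessor returns the exception class instead of importing it at module load, so `google-auth` stays a lazy dependency. A minimal usage sketch, assuming a SkyPilot install where `sky.adaptors.gcp` resolves; the helper and its `request` parameter are hypothetical:

```python
from sky.adaptors import gcp


def execute_or_none(request):
    """Hypothetical helper: run a prepared GCP API request, treating an
    expired or revoked credential (google.auth's RefreshError) as a soft
    failure rather than an unhandled crash."""
    try:
        return request.execute()
    except gcp.gcp_auth_refresh_error_exception():
        return None
```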
sky/adaptors/nebius.py
CHANGED
```diff
@@ -6,9 +6,11 @@ from sky.adaptors import common
 NEBIUS_TENANT_ID_FILENAME = 'NEBIUS_TENANT_ID.txt'
 NEBIUS_IAM_TOKEN_FILENAME = 'NEBIUS_IAM_TOKEN.txt'
 NEBIUS_PROJECT_ID_FILENAME = 'NEBIUS_PROJECT_ID.txt'
+NEBIUS_CREDENTIALS_FILENAME = 'credentials.json'
 NEBIUS_TENANT_ID_PATH = '~/.nebius/' + NEBIUS_TENANT_ID_FILENAME
 NEBIUS_IAM_TOKEN_PATH = '~/.nebius/' + NEBIUS_IAM_TOKEN_FILENAME
 NEBIUS_PROJECT_ID_PATH = '~/.nebius/' + NEBIUS_PROJECT_ID_FILENAME
+NEBIUS_CREDENTIALS_PATH = '~/.nebius/' + NEBIUS_CREDENTIALS_FILENAME
 
 MAX_RETRIES_TO_DISK_CREATE = 120
 MAX_RETRIES_TO_INSTANCE_STOP = 120
@@ -72,6 +74,11 @@ def get_iam_token():
     return _iam_token
 
 
+def is_token_or_cred_file_exist():
+    return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
+            os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))
+
+
 def get_project_id():
     global _project_id
     if _project_id is None:
@@ -97,4 +104,7 @@ def get_tenant_id():
 
 
 def sdk():
-    return nebius.sdk.SDK(credentials=get_iam_token())
+    if get_iam_token() is not None:
+        return nebius.sdk.SDK(credentials=get_iam_token())
+    return nebius.sdk.SDK(
+        credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
```
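Credential resolution now prefers the IAM token file and falls back to the new service-account `credentials.json`. A self-contained sketch of that precedence (paths copied from the diff; the function name is illustrative, not part of the module):

```python
import os

NEBIUS_IAM_TOKEN_PATH = '~/.nebius/NEBIUS_IAM_TOKEN.txt'
NEBIUS_CREDENTIALS_PATH = '~/.nebius/credentials.json'


def credential_source() -> str:
    """Illustrative: mirrors the order used by sdk() above."""
    if os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)):
        return 'iam-token'
    if os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)):
        return 'credentials-file'
    return 'none'
```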
sky/backends/backend_utils.py
CHANGED
```diff
@@ -1802,6 +1802,21 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         status == status_lib.ClusterStatus.UP for status in node_statuses) and
                      len(node_statuses) == handle.launched_nodes)
 
+    def get_node_counts_from_ray_status(
+            runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
+        rc, output, stderr = runner.run(
+            instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        if rc:
+            raise RuntimeError(
+                f'Refreshing status ({cluster_name!r}): Failed to check '
+                f'ray cluster\'s healthiness with '
+                f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
+                f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
+        return (*_count_healthy_nodes_from_ray(output), output, stderr)
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
         try:
             # NOTE: fetching the IPs is very slow as it calls into
@@ -1822,26 +1837,34 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 raise exceptions.FetchClusterInfoError(
                     reason=exceptions.FetchClusterInfoError.Reason.HEAD)
             head_runner = runners[0]
-            rc, output, stderr = head_runner.run(
-                instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            if rc:
-                raise RuntimeError(
-                    f'Refreshing status ({cluster_name!r}): Failed to check '
-                    f'ray cluster\'s healthiness with '
-                    f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
-                    f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
 
-            ready_head, ready_workers = _count_healthy_nodes_from_ray(output)
             total_nodes = handle.launched_nodes * handle.num_ips_per_node
-            if ready_head + ready_workers == total_nodes:
-                return True
+
+            for i in range(5):
+                ready_head, ready_workers, output, stderr = (
+                    get_node_counts_from_ray_status(head_runner))
+                if ready_head + ready_workers == total_nodes:
+                    return True
+                logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
+                             f'{i}: ray status not showing all nodes '
+                             f'({ready_head + ready_workers}/{total_nodes});\n'
+                             f'output:\n{output}\nstderr:\n{stderr}')
+
+                # If cluster JUST started, maybe not all the nodes have shown
+                # up. Try again for a few seconds.
+                # Note: We are okay with this performance hit because it's very
+                # rare to normally hit this case. It requires:
+                # - All the instances in the cluster are up on the cloud side
+                #   (not preempted), but
+                # - The ray cluster is somehow degraded so not all instances are
+                #   showing up
+                time.sleep(1)
+
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
-                f'{total_nodes})
+                f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
+
         except exceptions.FetchClusterInfoError:
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
```
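The net effect is that the health check now tolerates a ray cluster that is still registering nodes: up to five probes, one second apart, before the status refresh gives up. A self-contained sketch of that loop shape (names are illustrative, not SkyPilot's API):

```python
import time
from typing import Callable, Tuple


def poll_until_ready(get_counts: Callable[[], Tuple[int, int]],
                     total_nodes: int,
                     attempts: int = 5,
                     delay: float = 1.0) -> bool:
    """Retry a node-count probe before declaring the cluster unhealthy."""
    for _ in range(attempts):
        ready_head, ready_workers = get_counts()
        if ready_head + ready_workers == total_nodes:
            return True
        time.sleep(delay)  # nodes may still be registering with ray
    return False
```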
sky/backends/cloud_vm_ray_backend.py
CHANGED
```diff
@@ -772,32 +772,6 @@ class FailoverCloudErrorHandlerV1:
         setattr(e, 'detailed_reason', detailed_reason)
         raise e
 
-    @staticmethod
-    def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
-                        launchable_resources: 'resources_lib.Resources',
-                        region: 'clouds.Region',
-                        zones: Optional[List['clouds.Zone']], stdout: str,
-                        stderr: str):
-        del region, zones  # Unused.
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout,
-            stderr,
-            is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
-        messages = '\n '.join(errors)
-        style = colorama.Style
-        logger.warning(f' {style.DIM}{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
-
-        # Sometimes, LambdaCloudError will list available regions.
-        for e in errors:
-            if e.find('Regions with capacity available:') != -1:
-                for r in service_catalog.regions('lambda'):
-                    if e.find(r.name) == -1:
-                        _add_to_blocked_resources(
-                            blocked_resources,
-                            launchable_resources.copy(region=r.name, zone=None))
-
     @staticmethod
     def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -846,32 +820,6 @@ class FailoverCloudErrorHandlerV1:
         _add_to_blocked_resources(blocked_resources,
                                   launchable_resources.copy(zone=zone.name))
 
-    # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
-    @staticmethod
-    def _oci_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources',
-                     region: 'clouds.Region',
-                     zones: Optional[List['clouds.Zone']], stdout: str,
-                     stderr: str):
-        known_service_errors = [
-            'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
-            'LimitExceeded', 'NotAuthenticated'
-        ]
-        errors = FailoverCloudErrorHandlerV1._handle_errors(
-            stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
-            ('oci.exceptions.ServiceError' in x.strip() and any(
-                known_err in x.strip() for known_err in known_service_errors)))
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        style = colorama.Style
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-
-        if zones is not None:
-            for zone in zones:
-                _add_to_blocked_resources(
-                    blocked_resources,
-                    launchable_resources.copy(zone=zone.name))
-
     @staticmethod
     def update_blocklist_on_error(
         blocked_resources: Set['resources_lib.Resources'],
@@ -1123,6 +1071,23 @@ class FailoverCloudErrorHandlerV2:
             blocked_resources,
             launchable_resources.copy(zone=zone.name))
 
+    @staticmethod
+    def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
+                        launchable_resources: 'resources_lib.Resources',
+                        region: 'clouds.Region',
+                        zones: Optional[List['clouds.Zone']], error: Exception):
+        output = str(error)
+        # Sometimes, lambda cloud error will list available regions.
+        if output.find('Regions with capacity available:') != -1:
+            for r in service_catalog.regions('lambda'):
+                if output.find(r.name) == -1:
+                    _add_to_blocked_resources(
+                        blocked_resources,
+                        launchable_resources.copy(region=r.name, zone=None))
+        else:
+            FailoverCloudErrorHandlerV2._default_handler(
+                blocked_resources, launchable_resources, region, zones, error)
+
     @staticmethod
     def _default_handler(blocked_resources: Set['resources_lib.Resources'],
                          launchable_resources: 'resources_lib.Resources',
```
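The Lambda and OCI handlers move off the stdout/stderr-scraping V1 path; Lambda gets an exception-based V2 handler while keeping the same region heuristic: if the error message enumerates regions with capacity, every region it does not mention is blocked. A distilled sketch of that heuristic (signature and name are illustrative):

```python
from typing import List


def lambda_regions_to_block(error_message: str,
                            all_regions: List[str]) -> List[str]:
    """Block every region the capacity error does not list as available."""
    if 'Regions with capacity available:' not in error_message:
        return []  # not a capacity error; fall through to the default handler
    return [r for r in all_regions if r not in error_message]
```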
sky/cli.py
CHANGED
```diff
@@ -100,6 +100,7 @@ an autogenerated name."""
 # The maximum number of in-progress managed jobs to show in the status
 # command.
 _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
+_NUM_MANAGED_JOBS_TO_SHOW = 50
 
 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
     '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -1389,16 +1390,16 @@ def _handle_jobs_queue_request(
         request_id: str,
         show_all: bool,
         show_user: bool,
-        limit_num_jobs_to_show: bool,
+        max_num_jobs_to_show: Optional[int],
         is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.
 
     Args:
         show_all: Show all information of each job (e.g., region, price).
         show_user: Show the user who submitted the job.
-        limit_num_jobs_to_show: If True, limit the number of jobs to show to
-            _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
-            `sky status`.
+        max_num_jobs_to_show: If not None, limit the number of jobs to show to
+            this number, which is mainly used by `sky status`
+            and `sky jobs queue`.
         is_called_by_user: If this function is called by user directly, or an
             internal call.
 
@@ -1459,12 +1460,10 @@ def _handle_jobs_queue_request(
         msg += ('Failed to query managed jobs: '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
-                            if limit_num_jobs_to_show else None)
         msg = managed_jobs.format_job_table(managed_jobs_,
                                             show_all=show_all,
                                             show_user=show_user,
-                                            max_jobs=max_jobs_to_show)
+                                            max_jobs=max_num_jobs_to_show)
     return num_in_progress_jobs, msg
 
 
@@ -1875,7 +1874,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                 managed_jobs_queue_request_id,
                 show_all=False,
                 show_user=all_users,
-
+                max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
                 is_called_by_user=False)
         except KeyboardInterrupt:
             sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
@@ -3943,10 +3942,15 @@ def jobs_launch(
               is_flag=True,
               required=False,
               help='Show jobs from all users.')
+@click.option('--all',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Show all jobs.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool):
+               all_users: bool, all: bool):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4004,10 +4008,13 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
         managed_jobs_request_id = managed_jobs.queue(
             refresh=refresh, skip_finished=skip_finished, all_users=all_users)
-        num_jobs, msg = _handle_jobs_queue_request(managed_jobs_request_id,
-                                                   show_all=verbose,
-                                                   show_user=all_users,
-                                                   is_called_by_user=True)
+        max_num_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW if not all else None)
+        num_jobs, msg = _handle_jobs_queue_request(
+            managed_jobs_request_id,
+            show_all=verbose,
+            show_user=all_users,
+            max_num_jobs_to_show=max_num_jobs_to_show,
+            is_called_by_user=True)
     if not skip_finished:
         in_progress_only_hint = ''
     else:
@@ -4015,6 +4022,12 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
     click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                f'Managed jobs{colorama.Style.RESET_ALL}'
                f'{in_progress_only_hint}\n{msg}')
+    if max_num_jobs_to_show and num_jobs and max_num_jobs_to_show < num_jobs:
+        click.echo(
+            f'{colorama.Fore.CYAN}'
+            f'Only showing the latest {max_num_jobs_to_show} '
+            f'managed jobs'
+            f'(use --all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
```
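In short: `sky jobs queue` now caps output at 50 jobs and prints a hint when the table is cut off, while `sky jobs queue --all` lifts the cap. A condensed sketch of the hint logic from the last hunk (the standalone function is illustrative):

```python
from typing import Optional

_NUM_MANAGED_JOBS_TO_SHOW = 50


def truncation_hint(num_jobs: Optional[int],
                    max_num_jobs_to_show: Optional[int]) -> str:
    """Tell the user when the queue table was cut off."""
    if (max_num_jobs_to_show is not None and num_jobs is not None and
            max_num_jobs_to_show < num_jobs):
        return (f'Only showing the latest {max_num_jobs_to_show} managed '
                'jobs (use --all to show all managed jobs)')
    return ''
```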
sky/client/cli.py
CHANGED
The changes here are line-for-line identical to the `sky/cli.py` changes shown above; the client CLI bundled in the wheel is a copy of the same module, so the same hunks apply (`_NUM_MANAGED_JOBS_TO_SHOW = 50`, the `max_num_jobs_to_show` parameter, the `--all` flag on `jobs queue`, and the truncation hint).
sky/client/sdk.py
CHANGED
```diff
@@ -32,7 +32,6 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.client import common as client_common
 from sky.server import common as server_common
-from sky.server import constants as server_constants
 from sky.server.requests import payloads
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -1707,14 +1706,8 @@ def api_stop() -> None:
                 force=True)
             found = True
 
-    # Remove the database for requests including any files starting with
-    # api.constants.API_SERVER_REQUEST_DB_PATH
-    db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
-    for extension in ['', '-shm', '-wal']:
-        try:
-            os.remove(f'{db_path}{extension}')
-        except FileNotFoundError:
-            logger.debug(f'Database file {db_path}{extension} not found.')
+    # Remove the database for requests.
+    server_common.clear_local_api_server_database()
 
     if found:
         logger.info(f'{colorama.Fore.GREEN}SkyPilot API server stopped.'
```
sky/clouds/gcp.py
CHANGED
```diff
@@ -843,7 +843,10 @@ class GCP(clouds.Cloud):
         permissions = {'permissions': gcp_minimal_permissions}
         request = crm.projects().testIamPermissions(resource=project,
                                                     body=permissions)
-        ret_permissions = request.execute().get('permissions', [])
+        try:
+            ret_permissions = request.execute().get('permissions', [])
+        except gcp.gcp_auth_refresh_error_exception() as e:
+            return False, common_utils.format_exception(e, use_bracket=True)
 
         diffs = set(gcp_minimal_permissions).difference(set(ret_permissions))
         if diffs:
```
sky/clouds/nebius.py
CHANGED
```diff
@@ -17,6 +17,7 @@ _CREDENTIAL_FILES = [
     nebius.NEBIUS_TENANT_ID_FILENAME,
     nebius.NEBIUS_IAM_TOKEN_FILENAME,
     nebius.NEBIUS_PROJECT_ID_FILENAME,
+    nebius.NEBIUS_CREDENTIALS_FILENAME
 ]
 
 
@@ -252,15 +253,16 @@ class Nebius(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         """ Verify that the user has valid credentials for Nebius. """
         logging.debug('Nebius cloud check credentials')
-
-
-
+        token_cred_msg = (' Credentials can be set up by running: \n'\
+            f' $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'\
+            ' or generate ~/.nebius/credentials.json')  # pylint: disable=line-too-long
+
         tenant_msg = (' Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
                       f' $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
                       ' Or if you have 1 tenant you can run:\n'  # pylint: disable=line-too-long
                       f' $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n')  # pylint: disable=line-too-long
-        if
-        return False, f'{
+        if not nebius.is_token_or_cred_file_exist():
+            return False, f'{token_cred_msg}'
         sdk = nebius.sdk()
         tenant_id = nebius.get_tenant_id()
         if tenant_id is None:
@@ -272,7 +274,7 @@ class Nebius(clouds.Cloud):
         except nebius.request_error() as e:
             return False, (
                 f'{e.status} \n'  # First line is indented by 4 spaces
-                f'{
+                f'{token_cred_msg}'
                 f'{tenant_msg}')
         return True, None
```
sky/data/storage.py
CHANGED
```diff
@@ -143,6 +143,22 @@ class StoreType(enum.Enum):
 
         raise ValueError(f'Unsupported cloud for StoreType: {cloud}')
 
+    def to_cloud(self) -> str:
+        if self == StoreType.S3:
+            return str(clouds.AWS())
+        elif self == StoreType.GCS:
+            return str(clouds.GCP())
+        elif self == StoreType.AZURE:
+            return str(clouds.Azure())
+        elif self == StoreType.R2:
+            return cloudflare.NAME
+        elif self == StoreType.IBM:
+            return str(clouds.IBM())
+        elif self == StoreType.OCI:
+            return str(clouds.OCI())
+        else:
+            raise ValueError(f'Unknown store type: {self}')
+
     @classmethod
     def from_store(cls, store: 'AbstractStore') -> 'StoreType':
         if isinstance(store, S3Store):
```
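`to_cloud()` gives `StoreType` an inverse of the cloud-to-store mapping above. A hedged usage sketch, assuming the SkyPilot package is importable; the printed strings come from `str(clouds.GCP())` and `cloudflare.NAME`, so the comments are indicative rather than exact:

```python
from sky.data.storage import StoreType

# Map a store type back to its cloud's canonical name.
print(StoreType.GCS.to_cloud())  # e.g. 'GCP'
print(StoreType.R2.to_cloud())   # whatever cloudflare.NAME evaluates to
```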
sky/exceptions.py
CHANGED
```diff
@@ -28,12 +28,19 @@ GIT_FATAL_EXIT_CODE = 128
 ARCH_NOT_SUPPORTED_EXIT_CODE = 133
 
 
-def is_safe_exception(exc: Exception) -> bool:
+def is_safe_exception(exc: BaseException) -> bool:
     """Returns True if the exception is safe to send to clients.
 
     Safe exceptions are:
     1. Built-in exceptions
     2. SkyPilot's own exceptions
+
+    Args:
+        exc: The exception to check, accept BaseException to handle SystemExit
+            and KeyboardInterrupt.
+
+    Returns:
+        True if the exception is safe to send to clients, False otherwise.
     """
     module = type(exc).__module__
 
@@ -48,7 +55,7 @@ def is_safe_exception(exc: Exception) -> bool:
     return False
 
 
-def wrap_exception(exc: Exception) -> Exception:
+def wrap_exception(exc: BaseException) -> BaseException:
     """Wraps non-safe exceptions into SkyPilot exceptions
 
     This is used to wrap exceptions that are not safe to deserialize at clients.
@@ -64,7 +71,8 @@ def wrap_exception(exc: Exception) -> Exception:
             error_type=type(exc).__name__)
 
 
-def serialize_exception(e: Exception) -> Dict[str, Any]:
+# Accept BaseException to handle SystemExit and KeyboardInterrupt
+def serialize_exception(e: BaseException) -> Dict[str, Any]:
     """Serialize the exception.
 
     This function also wraps any unsafe exceptions (e.g., cloud exceptions)
```
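Widening these signatures from `Exception` to `BaseException` lets the server serialize `SystemExit` and `KeyboardInterrupt` raised inside request workers instead of losing them. A hedged caller sketch (the helper name is illustrative):

```python
from typing import Any, Callable, Dict, Optional, Tuple

from sky import exceptions


def run_and_capture(
        fn: Callable[[], Any]) -> Tuple[Any, Optional[Dict[str, Any]]]:
    """Illustrative: return either a result or a serialized error payload."""
    try:
        return fn(), None
    except BaseException as e:  # also catches SystemExit/KeyboardInterrupt
        return None, exceptions.serialize_exception(e)
```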
sky/provision/kubernetes/utils.py
CHANGED
```diff
@@ -717,6 +717,15 @@ def check_instance_fits(context: Optional[str],
         fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
         if reason is not None:
             return fits, reason
+    else:
+        # Check if any of the GPU nodes have sufficient number of GPUs.
+        gpu_nodes = [
+            node for node in gpu_nodes if
+            get_node_accelerator_count(node.status.allocatable) >= acc_count
+        ]
+        if not gpu_nodes:
+            return False, (
+                f'No GPU nodes found with {acc_count} or more GPUs.')
 
     candidate_nodes = gpu_nodes
     not_fit_reason_prefix = (
@@ -853,7 +862,7 @@ def get_accelerator_label_key_value(
             for label, value in label_list:
                 if (label_formatter.match_label_key(label) and
                         label_formatter.get_accelerator_from_label_value(
-                            value) == acc_type):
+                            value).lower() == acc_type.lower()):
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
```
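Two fixes here: non-TPU requests now require that a single node can supply the whole requested GPU count, and accelerator label matching becomes case-insensitive. Both conditions, distilled into illustrative helpers:

```python
from typing import List


def fits_on_one_node(node_gpu_counts: List[int], acc_count: int) -> bool:
    """A single node must hold all requested GPUs; no cross-node split."""
    return any(count >= acc_count for count in node_gpu_counts)


def label_matches(label_value: str, acc_type: str) -> bool:
    """Node labels may carry a differently cased GPU name (e.g. 'L4' vs 'l4')."""
    return label_value.lower() == acc_type.lower()
```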
sky/server/common.py
CHANGED
```diff
@@ -428,3 +428,19 @@ def reload_for_new_request(client_entrypoint: Optional[str],
     # necessary because the logger is initialized before the environment
     # variables are set, such as SKYPILOT_DEBUG.
     sky_logging.reload_logger()
+
+
+def clear_local_api_server_database() -> None:
+    """Removes the local API server database.
+
+    The CLI can call this during cleanup of a local API server, or the API
+    server can call it during startup.
+    """
+    # Remove the database for requests including any files starting with
+    # api.constants.API_SERVER_REQUEST_DB_PATH
+    db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
+    for extension in ['', '-shm', '-wal']:
+        try:
+            os.remove(f'{db_path}{extension}')
+        except FileNotFoundError:
+            logger.debug(f'Database file {db_path}{extension} not found.')
```
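For context: the bare path is the SQLite database itself, while the `-shm` and `-wal` suffixes are SQLite's shared-memory and write-ahead-log sidecar files; all three must be deleted together to fully reset the requests database.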
sky/server/requests/event_loop.py
ADDED
```diff
@@ -0,0 +1,31 @@
+"""Executor event loop to process tasks in coroutines."""
+import asyncio
+import concurrent.futures
+import threading
+from typing import Coroutine, Optional
+
+# Dedicated event loop for requests, isolated with the event loop managed
+# by uvicorn. This is responsible for light-weight async tasks or sub-tasks,
+# refer to `executor.py` for more details about cooperation between the event
+# loop and executor process pool.
+_EVENT_LOOP: Optional[asyncio.AbstractEventLoop] = None
+_LOCK = threading.Lock()
+
+
+def run(coro: Coroutine) -> concurrent.futures.Future:
+    """Run a coroutine asynchronously in the request event loop."""
+    return asyncio.run_coroutine_threadsafe(coro, get_event_loop())
+
+
+def get_event_loop() -> asyncio.AbstractEventLoop:
+    """Open and get the event loop."""
+    global _EVENT_LOOP
+    if _EVENT_LOOP is not None and not _EVENT_LOOP.is_closed():
+        return _EVENT_LOOP
+    with _LOCK:
+        if _EVENT_LOOP is None or _EVENT_LOOP.is_closed():
+            _EVENT_LOOP = asyncio.new_event_loop()
+            loop_thread = threading.Thread(target=_EVENT_LOOP.run_forever,
+                                           daemon=True)
+            loop_thread.start()
+        return _EVENT_LOOP
```