skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +38 -14
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +18 -9
- sky/client/cli/table_utils.py +34 -0
- sky/client/common.py +4 -2
- sky/client/sdk.py +11 -7
- sky/client/sdk_async.py +5 -5
- sky/core.py +6 -6
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
- sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +57 -34
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +98 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/provision/runpod/__init__.py +2 -0
- sky/schemas/api/responses.py +18 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/payloads.py +2 -1
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +4 -1
- sky/skylet/events.py +42 -0
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +8 -3
- sky/utils/db/db_utils.py +5 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
- sky/volumes/server/core.py +1 -0
- sky/volumes/volume.py +16 -17
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
- /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '…
+_SKYPILOT_COMMIT_SHA = 'e42224b6d29bd960c0e0daa69add0fe2ad695142'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250925'
+__version__ = '1.0.0.dev20250927'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/backends/backend_utils.py
CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, List, Optional, Sequence,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+                    Set, Tuple, TypeVar, Union)
 import uuid
 
 import aiohttp
@@ -797,7 +797,7 @@ def write_cluster_config(
         cloud=str(cloud).lower(),
         region=region.name,
         keys=('use_ssm',),
-        default_value=…
+        default_value=None)
 
     if use_ssm and ssh_proxy_command is not None:
         raise exceptions.InvalidCloudConfigs(
@@ -805,15 +805,18 @@
             f'is already set to {ssh_proxy_command!r}. Please remove '
             'ssh_proxy_command or set use_ssm to false.')
 
-    if …
-… (8 more removed lines not captured)
+    if use_internal_ips and ssh_proxy_command is None:
+        # Only if use_ssm is explicitly not set, we default to using SSM.
+        if use_ssm is None:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}'
+                'use_internal_ips is set to true, '
+                'but ssh_proxy_command is not set. Defaulting to '
+                'using SSM. Specify ssh_proxy_command to use a different '
+                'https://docs.skypilot.co/en/latest/reference/config.html#'
+                f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+            use_ssm = True
+
     if use_ssm:
         aws_profile = os.environ.get('AWS_PROFILE', None)
         profile_str = f'--profile {aws_profile}' if aws_profile else ''
@@ -1223,7 +1226,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     Rather than constructing the whole byte sequence, which may be quite large,
     we construct it incrementally by using hash.update() to add new bytes.
     """
-
     # Load the yaml contents so that we can directly remove keys.
     yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -3856,13 +3858,35 @@ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     ) from last_exception
 
 
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
 def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
     if e.code() == grpc.StatusCode.INTERNAL:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.SkyletInternalError(e.details())
     elif e.code() == grpc.StatusCode.UNAVAILABLE:
         time.sleep(current_backoff)
-    elif e.code() == grpc.StatusCode.UNIMPLEMENTED:
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
         # Handle backwards compatibility: old server doesn't implement this RPC.
         # Let the caller fall back to legacy execution.
         raise exceptions.SkyletMethodNotImplementedError(
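For context, a minimal caller-side sketch of how the new streaming helper pairs with the TailLogs RPC added to SkyletClient in cloud_vm_ray_backend.py below. This is illustrative only and not part of the diff; it assumes a cluster handle obtained from the backend and mirrors the tail-logs call path shown later in this release.

# Illustrative sketch (not part of this release's diff).
from sky.backends import backend_utils
from sky.backends.cloud_vm_ray_backend import SkyletClient
from sky.schemas.generated import jobsv1_pb2

def stream_job_logs(handle, job_id: int) -> int:
    # Follow a job's logs over the Skylet gRPC channel, retrying on
    # transient channel errors via the new streaming helper.
    request = jobsv1_pb2.TailLogsRequest(job_id=job_id, follow=True)
    exit_code = 0
    for resp in backend_utils.invoke_skylet_streaming_with_retries(
            lambda: SkyletClient(handle.get_grpc_channel()).tail_logs(
                request, timeout=None)):
        if resp.log_line:
            print(resp.log_line, end='', flush=True)
        exit_code = resp.exit_code
    return exit_code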
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -19,8 +19,8 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)
 
 import colorama
 import psutil
@@ -91,6 +91,8 @@ if typing.TYPE_CHECKING:
     from sky.schemas.generated import autostopv1_pb2_grpc
     from sky.schemas.generated import jobsv1_pb2
     from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
     from sky.schemas.generated import servev1_pb2
     from sky.schemas.generated import servev1_pb2_grpc
 else:
@@ -111,6 +113,10 @@ else:
         'sky.schemas.generated.servev1_pb2')
     servev1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')
 
 Path = str
 
@@ -2737,6 +2743,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             (tunnel.port, tunnel.pid) if tunnel is not None else None)
 
     def get_grpc_channel(self) -> 'grpc.Channel':
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
         # It's fine to not grab the lock here, as we're only reading,
         # and writes are very rare.
         # It's acceptable to read while another process is opening a tunnel,
@@ -2753,7 +2764,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                s.connect(('localhost', tunnel.port))
-            return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
        except socket.error as e:
            logger.warning(
                'Failed to connect to SSH tunnel for cluster '
@@ -2772,19 +2784,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                f'{self.cluster_name!r}, '
                'opening the tunnel')
            tunnel = self._open_and_update_skylet_tunnel()
-            return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                s.connect(('localhost', tunnel.port))
-            return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
        except socket.error as e:
            logger.warning(
                'Failed to connect to SSH tunnel for cluster '
                f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                'opening new tunnel')
            tunnel = self._open_and_update_skylet_tunnel()
-            return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
        except locks.LockTimeout as e:
            raise RuntimeError(
                'Failed to get gRPC channel for cluster '
@@ -3060,6 +3075,8 @@ class SkyletClient:
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
         self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
         self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
 
     def set_autostop(
         self,
@@ -3146,6 +3163,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
 
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
     def get_service_status(
         self,
         request: 'servev1_pb2.GetServiceStatusRequest',
@@ -3194,6 +3218,35 @@ class SkyletClient:
     ) -> 'servev1_pb2.UpdateServiceResponse':
         return self._serve_stub.UpdateService(request, timeout=timeout)
 
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                          timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
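The new managed-jobs stubs are consumed the same way as the existing jobs/serve stubs. A minimal caller-side sketch (illustrative, not part of the diff; it mirrors the sync_down_logs change later in this file):

# Illustrative sketch (not part of this release's diff).
from sky.backends import backend_utils
from sky.backends.cloud_vm_ray_backend import SkyletClient
from sky.schemas.generated import managed_jobsv1_pb2

def managed_job_ids_by_name(handle, job_name):
    # Ask the jobs controller for all managed job IDs matching a name,
    # retrying on transient gRPC failures.
    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(job_name=job_name)
    response = backend_utils.invoke_skylet_with_retries(
        lambda: SkyletClient(handle.get_grpc_channel(
        )).get_all_managed_job_ids_by_name(request))
    return list(response.job_ids)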
@@ -3706,7 +3759,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 ux_utils.spinner_message('Preparing SkyPilot runtime')):
             use_legacy = not handle.is_grpc_enabled_with_flag
 
-            if …
+            if not use_legacy:
                 try:
                     request = jobsv1_pb2.UpdateStatusRequest()
                     backend_utils.invoke_skylet_with_retries(
@@ -3730,7 +3783,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # 2. On next `sky start`, it gets reset to FAILED.
             use_legacy = not handle.is_grpc_enabled_with_flag
 
-            if …
+            if not use_legacy:
                 try:
                     fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
                     backend_utils.invoke_skylet_with_retries(
@@ -4165,7 +4218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if …
+        if not use_legacy:
             try:
                 managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
                 if managed_job_dag is not None:
@@ -4297,7 +4350,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 metadata: str) -> Tuple[int, str]:
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if …
+        if not use_legacy:
             try:
                 request = jobsv1_pb2.AddJobRequest(
                     job_name=job_name,
@@ -4567,7 +4620,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         """
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if …
+        if not use_legacy:
             try:
                 request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
                                                        cancel_all=cancel_all,
@@ -4610,7 +4663,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_to_dir: Dict[str, str] = {}
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if …
+        if not use_legacy:
             try:
                 int_job_ids = []
                 if job_ids:
@@ -4724,6 +4777,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         The exit code of the tail command. Returns code 100 if the job has
         failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
@@ -4761,6 +4836,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                          tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4806,20 +4882,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            # …
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-… (11 removed lines not captured)
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -4847,18 +4940,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-… (12 removed lines not captured)
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
             if not run_timestamps:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching log directories found'
@@ -4925,6 +5039,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         exist_ok=True)
             log_file = os.path.join(local_log_dir, 'run.log')
 
+            # TODO(kevin): Migrate stream_logs to gRPC
             code = managed_jobs.ManagedJobCodeGen.stream_logs(
                 job_name=None,
                 job_id=int(job_id),
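The recurring shape of these changes is "try the Skylet gRPC path first, and fall back to the legacy CodeGen path when the remote Skylet predates the RPC". Distilled into a standalone sketch (illustrative only; do_rpc and do_legacy are hypothetical callables standing in for the concrete calls above):

# Illustrative sketch of the fallback pattern used throughout this file
# (not part of the diff). `do_rpc` and `do_legacy` are hypothetical.
from sky import exceptions
from sky.backends import backend_utils

def call_with_fallback(handle, do_rpc, do_legacy):
    if handle.is_grpc_enabled_with_flag:
        try:
            return backend_utils.invoke_skylet_with_retries(do_rpc)
        except exceptions.SkyletMethodNotImplementedError:
            # Remote Skylet predates this RPC; fall through to legacy path.
            pass
    return do_legacy()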
sky/client/cli/command.py
CHANGED
@@ -59,6 +59,7 @@ from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.client import sdk
 from sky.client.cli import flags
+from sky.client.cli import table_utils
 from sky.data import storage_utils
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -2125,7 +2126,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
                 f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
                 f'  {common_utils.format_exception(e)}')
             return
-        job_tables[cluster] = …
+        job_tables[cluster] = table_utils.format_job_queue(job_table)
 
     subprocess_utils.run_in_parallel(_get_job_queue, clusters)
     user_str = 'all users' if all_users else 'current user'
@@ -5906,23 +5907,31 @@ def local():
     required=False,
     help='Name to use for the kubeconfig context. Defaults to "default". '
     'Used with the ip list.')
-@click.option(
-    '--name',
-    type=str,
-    required=False,
-    help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
 @click.option('--password',
               type=str,
               required=False,
               help='Password for the ssh-user to execute sudo commands. '
               'Required only if passwordless sudo is not setup.')
+@click.option(
+    '--name',
+    type=str,
+    required=False,
+    help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
+@click.option(
+    '--port-start',
+    type=int,
+    required=False,
+    help='Starting port range for the local kind cluster. Needs to be a '
+    'multiple of 100. If not given, a random range will be used. '
+    'Used without ip list.')
 @local.command('up', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
 def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
-             cleanup: bool, context_name: Optional[str],
-             password: Optional[str], …
+             cleanup: bool, context_name: Optional[str],
+             password: Optional[str], name: Optional[str],
+             port_start: Optional[int], async_call: bool):
     """Creates a local or remote cluster."""
 
     def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
@@ -5968,7 +5977,7 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
             f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
 
     request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
-                              context_name, name, …
+                              context_name, password, name, port_start)
     _async_call_or_wait(request_id, async_call, request_name='local up')
 
 
sky/client/cli/table_utils.py
ADDED
@@ -0,0 +1,34 @@
+"""Utilities for formatting tables for CLI output."""
+from typing import List
+
+from sky.schemas.api import responses
+from sky.utils import log_utils
+
+
+def format_job_queue(jobs: List[responses.ClusterJobRecord]):
+    """Format the job queue for display.
+
+    Usage:
+        jobs = get_job_queue()
+        print(format_job_queue(jobs))
+    """
+    job_table = log_utils.create_table([
+        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
+        'STATUS', 'LOG', 'GIT COMMIT'
+    ])
+    for job in jobs:
+        job_table.add_row([
+            job.job_id,
+            job.job_name,
+            job.username,
+            log_utils.readable_time_duration(job.submitted_at),
+            log_utils.readable_time_duration(job.start_at),
+            log_utils.readable_time_duration(job.start_at,
+                                             job.end_at,
+                                             absolute=True),
+            job.resources,
+            job.status.colored_str(),
+            job.log_path,
+            job.metadata.get('git_commit', '-'),
+        ])
+    return job_table
sky/client/common.py
CHANGED
@@ -44,8 +44,10 @@ logger = sky_logging.init_logger(__name__)
 _DOWNLOAD_CHUNK_BYTES = 8192
 # The chunk size for the zip file to be uploaded to the API server. We split
 # the zip file into chunks to avoid network issues for large request body that
-# can be caused by NGINX's client_max_body_size.
-… (1 removed line not captured)
+# can be caused by NGINX's client_max_body_size or Cloudflare's upload limit.
+# As of 09/25/2025, the upload limit for Cloudflare's free plan is 100MiB:
+# https://developers.cloudflare.com/support/troubleshooting/http-status-codes/4xx-client-error/error-413/
+_UPLOAD_CHUNK_BYTES = 100 * 1024 * 1024
 
 FILE_UPLOAD_LOGS_DIR = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                     'file_uploads')
sky/client/sdk.py
CHANGED
@@ -1267,9 +1267,11 @@ def autostop(
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def queue(…
-… (2 more removed lines not captured)
+def queue(
+    cluster_name: str,
+    skip_finished: bool = False,
+    all_users: bool = False
+) -> server_common.RequestId[List[responses.ClusterJobRecord]]:
     """Gets the job queue of a cluster.
 
     Args:
@@ -1282,8 +1284,8 @@ def queue(cluster_name: str,
         The request ID of the queue request.
 
     Request Returns:
-        job_records (List[…
-            … queue.
+        job_records (List[responses.ClusterJobRecord]): A list of job records
+            for each job in the queue.
 
     .. code-block:: python
 
@@ -1677,8 +1679,9 @@ def local_up(gpus: bool,
              ssh_key: Optional[str],
              cleanup: bool,
              context_name: Optional[str] = None,
+             password: Optional[str] = None,
              name: Optional[str] = None,
-… (1 removed line not captured)
+             port_start: Optional[int] = None) -> server_common.RequestId[None]:
     """Launches a Kubernetes cluster on local machines.
 
     Returns:
@@ -1698,8 +1701,9 @@ def local_up(gpus: bool,
         ssh_key=ssh_key,
         cleanup=cleanup,
         context_name=context_name,
+        password=password,
         name=name,
-… (1 removed line not captured)
+        port_start=port_start)
     response = server_common.make_authenticated_request(
         'POST', '/local_up', json=json.loads(body.model_dump_json()))
     return server_common.get_request_id(response)
sky/client/sdk_async.py
CHANGED
@@ -523,11 +523,11 @@ async def autostop(
 @usage_lib.entrypoint
 @annotations.client_api
 async def queue(
-… (4 removed lines not captured)
-) -> List[…
+    cluster_name: str,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
+) -> List[responses.ClusterJobRecord]:
     """Async version of queue() that gets the job queue of a cluster."""
     request_id = await context_utils.to_thread(sdk.queue, cluster_name,
                                                skip_finished, all_users)