skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl
This diff shows the content of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +25 -4
- sky/backends/cloud_vm_ray_backend.py +151 -36
- sky/client/cli/command.py +2 -1
- sky/client/cli/table_utils.py +34 -0
- sky/client/sdk.py +7 -5
- sky/client/sdk_async.py +5 -5
- sky/core.py +3 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-7340bc0f0dd8ae74.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/server/core.py +96 -26
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +85 -7
- sky/schemas/api/responses.py +18 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/requests/serializers/decoders.py +2 -2
- sky/server/requests/serializers/encoders.py +7 -3
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +32 -32
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +56 -52
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'e42224b6d29bd960c0e0daa69add0fe2ad695142'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250927'
 __root_dir__ = directory_utils.get_sky_dir()
sky/backends/backend_utils.py
CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, List, Optional, Sequence,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+                    Set, Tuple, TypeVar, Union)
 import uuid
 
 import aiohttp
@@ -1226,7 +1226,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     Rather than constructing the whole byte sequence, which may be quite large,
     we construct it incrementally by using hash.update() to add new bytes.
     """
-
     # Load the yaml contents so that we can directly remove keys.
     yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -3859,13 +3858,35 @@ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     ) from last_exception
 
 
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
 def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
     if e.code() == grpc.StatusCode.INTERNAL:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.SkyletInternalError(e.details())
     elif e.code() == grpc.StatusCode.UNAVAILABLE:
         time.sleep(current_backoff)
-    elif e.code() == grpc.StatusCode.UNIMPLEMENTED:
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
         # Handle backwards compatibility: old server doesn't implement this RPC.
         # Let the caller fall back to legacy execution.
         raise exceptions.SkyletMethodNotImplementedError(
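The new `invoke_skylet_streaming_with_retries` helper retries a streaming RPC by re-invoking the callable that creates the stream, rather than retrying an iterator that has already failed. Below is a minimal, self-contained sketch of the same pattern; the names are hypothetical and `ConnectionError` stands in for `grpc.RpcError`.

```python
import time
from typing import Callable, Iterator, Optional, TypeVar

T = TypeVar('T')


def stream_with_retries(make_stream: Callable[[], Iterator[T]],
                        max_attempts: int = 3,
                        backoff_s: float = 0.5) -> Iterator[T]:
    """Yield items from make_stream(), re-invoking it on transient errors."""
    last_exc: Optional[Exception] = None
    for attempt in range(max_attempts):
        try:
            # The stream must be re-created on every attempt: an iterator
            # that has already raised cannot be resumed.
            for item in make_stream():
                yield item
            return
        except ConnectionError as exc:  # stand-in for grpc.RpcError
            last_exc = exc
            time.sleep(backoff_s * 2**attempt)
    raise RuntimeError(
        f'stream failed after {max_attempts} attempts') from last_exc
```

One property of this shape worth noting: if a failure happens after some items have already been yielded, a retry replays the stream from the beginning, so a consumer may see repeated items unless the server can resume from an offset.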
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -19,8 +19,8 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)
 
 import colorama
 import psutil
@@ -91,6 +91,8 @@ if typing.TYPE_CHECKING:
     from sky.schemas.generated import autostopv1_pb2_grpc
     from sky.schemas.generated import jobsv1_pb2
     from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
     from sky.schemas.generated import servev1_pb2
     from sky.schemas.generated import servev1_pb2_grpc
 else:
@@ -111,6 +113,10 @@ else:
         'sky.schemas.generated.servev1_pb2')
     servev1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')
 
 Path = str
 
@@ -2737,6 +2743,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             (tunnel.port, tunnel.pid) if tunnel is not None else None)
 
     def get_grpc_channel(self) -> 'grpc.Channel':
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
         # It's fine to not grab the lock here, as we're only reading,
         # and writes are very rare.
         # It's acceptable to read while another process is opening a tunnel,
@@ -2753,7 +2764,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 s.settimeout(0.5)
                 s.connect(('localhost', tunnel.port))
-                return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
         except socket.error as e:
             logger.warning(
                 'Failed to connect to SSH tunnel for cluster '
@@ -2772,19 +2784,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'{self.cluster_name!r}, '
                 'opening the tunnel')
             tunnel = self._open_and_update_skylet_tunnel()
-            return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
         try:
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 s.settimeout(0.5)
                 s.connect(('localhost', tunnel.port))
-                return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
         except socket.error as e:
             logger.warning(
                 'Failed to connect to SSH tunnel for cluster '
                 f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                 'opening new tunnel')
             tunnel = self._open_and_update_skylet_tunnel()
-            return grpc.insecure_channel(f'localhost:{tunnel.port}')
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
     except locks.LockTimeout as e:
         raise RuntimeError(
             'Failed to get gRPC channel for cluster '
@@ -3060,6 +3075,8 @@ class SkyletClient:
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
         self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
         self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
 
     def set_autostop(
         self,
@@ -3146,6 +3163,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
 
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
     def get_service_status(
         self,
         request: 'servev1_pb2.GetServiceStatusRequest',
@@ -3194,6 +3218,35 @@ class SkyletClient:
     ) -> 'servev1_pb2.UpdateServiceResponse':
         return self._serve_stub.UpdateService(request, timeout=timeout)
 
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
                                                           timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
@@ -3706,7 +3759,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 ux_utils.spinner_message('Preparing SkyPilot runtime')):
             use_legacy = not handle.is_grpc_enabled_with_flag
 
-            if
+            if not use_legacy:
                 try:
                     request = jobsv1_pb2.UpdateStatusRequest()
                     backend_utils.invoke_skylet_with_retries(
@@ -3730,7 +3783,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # 2. On next `sky start`, it gets reset to FAILED.
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
            try:
                fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
                backend_utils.invoke_skylet_with_retries(
@@ -4165,7 +4218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         _dump_code_to_file(job_submit_cmd,
                            constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if
+        if not use_legacy:
            try:
                managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
                if managed_job_dag is not None:
@@ -4297,7 +4350,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                  metadata: str) -> Tuple[int, str]:
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
            try:
                request = jobsv1_pb2.AddJobRequest(
                    job_name=job_name,
@@ -4567,7 +4620,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         """
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
            try:
                request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
                                                       cancel_all=cancel_all,
@@ -4610,7 +4663,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_to_dir: Dict[str, str] = {}
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
            try:
                int_job_ids = []
                if job_ids:
@@ -4724,6 +4777,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         The exit code of the tail command. Returns code 100 if the job has
         failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
@@ -4761,6 +4836,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                   tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4806,20 +4882,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            #
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-
-
-
-
-
-
-
-
-
-
-
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
                                                    'Failed to sync down logs.',
                                                    stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -4847,18 +4940,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-
-
-
-
-
-
-
-
-
-
-
-
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
                                                    'Failed to sync logs.',
                                                    stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
             if not run_timestamps:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching log directories found'
@@ -4925,6 +5039,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                          exist_ok=True)
         log_file = os.path.join(local_log_dir, 'run.log')
 
+        # TODO(kevin): Migrate stream_logs to gRPC
        code = managed_jobs.ManagedJobCodeGen.stream_logs(
            job_name=None,
            job_id=int(job_id),
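For reference, here is a sketch of the two generic gRPC pieces these hunks rely on: a channel option that lifts the default 4 MB inbound-message cap, and iteration over a server-streaming call. The address and the commented-out stub are placeholders, not SkyPilot's actual wiring.

```python
import grpc

# -1 removes the default 4 MB limit on inbound messages, which matters when
# responses embed large payloads such as task YAMLs.
channel = grpc.insecure_channel(
    'localhost:50051',  # placeholder target
    options=[('grpc.max_receive_message_length', -1)])

# A server-streaming stub call returns an iterator of response messages:
# stub = jobsv1_pb2_grpc.JobsServiceStub(channel)       # hypothetical stub
# for resp in stub.TailLogs(request, timeout=None):     # blocks while following
#     print(resp.log_line, end='', flush=True)
```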
sky/client/cli/command.py
CHANGED
@@ -59,6 +59,7 @@ from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.client import sdk
 from sky.client.cli import flags
+from sky.client.cli import table_utils
 from sky.data import storage_utils
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -2125,7 +2126,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
                 f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
                 f'  {common_utils.format_exception(e)}')
             return
-        job_tables[cluster] =
+        job_tables[cluster] = table_utils.format_job_queue(job_table)
 
     subprocess_utils.run_in_parallel(_get_job_queue, clusters)
     user_str = 'all users' if all_users else 'current user'
sky/client/cli/table_utils.py
ADDED
@@ -0,0 +1,34 @@
+"""Utilities for formatting tables for CLI output."""
+from typing import List
+
+from sky.schemas.api import responses
+from sky.utils import log_utils
+
+
+def format_job_queue(jobs: List[responses.ClusterJobRecord]):
+    """Format the job queue for display.
+
+    Usage:
+        jobs = get_job_queue()
+        print(format_job_queue(jobs))
+    """
+    job_table = log_utils.create_table([
+        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
+        'STATUS', 'LOG', 'GIT COMMIT'
+    ])
+    for job in jobs:
+        job_table.add_row([
+            job.job_id,
+            job.job_name,
+            job.username,
+            log_utils.readable_time_duration(job.submitted_at),
+            log_utils.readable_time_duration(job.start_at),
+            log_utils.readable_time_duration(job.start_at,
+                                             job.end_at,
+                                             absolute=True),
+            job.resources,
+            job.status.colored_str(),
+            job.log_path,
+            job.metadata.get('git_commit', '-'),
+        ])
+    return job_table
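A usage sketch for the new helper, assuming the client SDK's usual request-ID flow (`sdk.queue()` returns a request ID and `sdk.get()` blocks on its result); the cluster name is a placeholder.

```python
from sky.client import sdk
from sky.client.cli import table_utils

# Submit the queue request and block on the result.
request_id = sdk.queue('my-cluster', skip_finished=True)
jobs = sdk.get(request_id)  # List[responses.ClusterJobRecord]
print(table_utils.format_job_queue(jobs))
```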
sky/client/sdk.py
CHANGED
@@ -1267,9 +1267,11 @@ def autostop(
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def queue(
-
-
+def queue(
+    cluster_name: str,
+    skip_finished: bool = False,
+    all_users: bool = False
+) -> server_common.RequestId[List[responses.ClusterJobRecord]]:
     """Gets the job queue of a cluster.
 
     Args:
@@ -1282,8 +1284,8 @@ def queue(cluster_name: str,
         The request ID of the queue request.
 
     Request Returns:
-        job_records (List[
-            queue.
+        job_records (List[responses.ClusterJobRecord]): A list of job records
+            for each job in the queue.
 
     .. code-block:: python
 
sky/client/sdk_async.py
CHANGED
@@ -523,11 +523,11 @@ async def autostop(
 @usage_lib.entrypoint
 @annotations.client_api
 async def queue(
-
-
-
-
-) -> List[
+    cluster_name: str,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
+) -> List[responses.ClusterJobRecord]:
     """Async version of queue() that gets the job queue of a cluster."""
     request_id = await context_utils.to_thread(sdk.queue, cluster_name,
                                                skip_finished, all_users)
sky/core.py
CHANGED
@@ -803,7 +803,7 @@ def autostop(
 @usage_lib.entrypoint
 def queue(cluster_name: str,
           skip_finished: bool = False,
-          all_users: bool = False) -> List[
+          all_users: bool = False) -> List[responses.ClusterJobRecord]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets the job queue of a cluster.
 
@@ -850,7 +850,7 @@ def queue(cluster_name: str,
 
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if
+    if not use_legacy:
         try:
             request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
                                                     all_jobs=all_jobs)
@@ -879,7 +879,6 @@ def queue(cluster_name: str,
             jobs.append(job_dict)
         except exceptions.SkyletMethodNotImplementedError:
             use_legacy = True
-
     if use_legacy:
         code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
         returncode, jobs_payload, stderr = backend.run_on_head(
@@ -891,7 +890,7 @@ def queue(cluster_name: str,
             stderr=f'{jobs_payload + stderr}',
             stream_logs=True)
         jobs = job_lib.load_job_queue(jobs_payload)
-    return jobs
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
 
 
 @usage_lib.entrypoint
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
Old (truncated by the diff viewer):
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
New:
<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7340bc0f0dd8ae74.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"UDSEoDB67vwFMZyCJ4HWU","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js
RENAMED
@@ -1 +1 @@
Old (truncated by the diff viewer):
self.__BUILD_MANIFEST=function(s,c,a,t,e,f,u,n,b,o,j,i,r,
New:
self.__BUILD_MANIFEST=function(s,c,a,t,e,f,u,n,b,o,j,i,r,d){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-444f1804401f04ea.js"],"/_error":["static/chunks/pages/_error-c66a4e8afc46f17b.js"],"/clusters":["static/chunks/pages/clusters-469814d711d63b1b.js"],"/clusters/[cluster]":[s,c,a,f,u,"static/chunks/4676-9da7fdbde90b5549.js",o,t,e,n,j,b,i,"static/chunks/6856-5fdc9b851a18acdb.js",r,d,"static/chunks/9037-d0c00018a5ba198c.js","static/chunks/pages/clusters/[cluster]-e052384df65ef200.js"],"/clusters/[cluster]/[job]":[s,c,a,f,t,e,b,"static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js"],"/config":["static/chunks/pages/config-dfb9bf07b13045f4.js"],"/infra":["static/chunks/pages/infra-aabba60d57826e0f.js"],"/infra/[context]":["static/chunks/pages/infra/[context]-6563820e094f68ca.js"],"/jobs":["static/chunks/pages/jobs-1f70d9faa564804f.js"],"/jobs/pools/[pool]":[s,c,a,u,o,t,e,n,"static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js"],"/jobs/[job]":[s,c,a,f,u,o,t,e,n,b,"static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js"],"/users":["static/chunks/pages/users-018bf31cda52e11b.js"],"/volumes":["static/chunks/pages/volumes-739726d6b823f532.js"],"/workspace/new":["static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js"],"/workspaces":["static/chunks/pages/workspaces-7528cc0ef8c522c5.js"],"/workspaces/[name]":[s,c,a,f,u,"static/chunks/1836-37fede578e2da5f8.js",t,e,n,j,b,i,r,d,"static/chunks/1141-159df2d4c441a9d1.js","static/chunks/pages/workspaces/[name]-af76bb06dbb3954f.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/config","/infra","/infra/[context]","/jobs","/jobs/pools/[pool]","/jobs/[job]","/users","/volumes","/workspace/new","/workspaces","/workspaces/[name]"]}}("static/chunks/616-3d59f75e2ccf9321.js","static/chunks/6130-2be46d70a38f1e82.js","static/chunks/5739-d67458fcb1386c92.js","static/chunks/6989-01359c57e018caa4.js","static/chunks/3850-ff4a9a69d978632b.js","static/chunks/7411-b15471acd2cba716.js","static/chunks/1272-1ef0bf0237faccdb.js","static/chunks/8969-d8bc3a2b9cf839a9.js","static/chunks/6135-4b4d5e824b7f9d3c.js","static/chunks/754-d0da8ab45f9509e9.js","static/chunks/6990-f6818c84ed8f1c86.js","static/chunks/1121-d0782b9251f0fcd3.js","static/chunks/6601-06114c982db410b6.js","static/chunks/3015-88c7c8d69b0b6dba.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();