skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20251001__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +43 -14
- sky/backends/cloud_vm_ray_backend.py +153 -38
- sky/check.py +0 -29
- sky/client/cli/command.py +48 -26
- sky/client/cli/table_utils.py +91 -0
- sky/client/sdk.py +14 -23
- sky/client/sdk_async.py +5 -5
- sky/core.py +18 -20
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-07349868f7905d37.js → [pool]-509b2977a6373bf6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-4f0c389a4ce5fd9c.js} +1 -1
- sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -0
- sky/data/storage_utils.py +1 -45
- sky/execution.py +0 -1
- sky/global_user_state.py +3 -3
- sky/jobs/client/sdk.py +3 -2
- sky/jobs/controller.py +15 -0
- sky/jobs/server/core.py +120 -28
- sky/jobs/server/server.py +1 -1
- sky/jobs/server/utils.py +65 -32
- sky/jobs/state.py +145 -3
- sky/jobs/utils.py +87 -8
- sky/provision/kubernetes/instance.py +1 -1
- sky/schemas/api/responses.py +73 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/serve/serve_utils.py +16 -0
- sky/serve/server/core.py +1 -1
- sky/serve/server/impl.py +6 -6
- sky/server/common.py +2 -1
- sky/server/requests/serializers/decoders.py +10 -6
- sky/server/requests/serializers/encoders.py +13 -8
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +2 -32
- sky/skylet/log_lib.py +211 -0
- sky/skylet/log_lib.pyi +30 -1
- sky/skylet/services.py +208 -2
- sky/skylet/skylet.py +3 -0
- sky/task.py +4 -0
- sky/utils/cluster_utils.py +23 -5
- sky/utils/command_runner.py +21 -5
- sky/utils/command_runner.pyi +11 -0
- sky/utils/volume.py +5 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/METADATA +35 -35
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/RECORD +70 -66
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
- /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → m3YT2i5s6v4SsIdYc8WZa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20251001.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '047e366ad4c073a5753b62cc5d4c3c4660b1476a'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250926'
+__version__ = '1.0.0.dev20251001'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/backends/backend_utils.py
CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, List, Optional, Sequence,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+                    Set, Tuple, TypeVar, Union)
 import uuid
 
 import aiohttp
@@ -723,11 +723,15 @@ def write_cluster_config(
                 'is not supported by this cloud. Remove the config or set: '
                 '`remote_identity: LOCAL_CREDENTIALS`.')
         if isinstance(cloud, clouds.Kubernetes):
-            if skypilot_config.get_effective_region_config(
+            allowed_contexts = skypilot_config.get_workspace_cloud(
+                'kubernetes').get('allowed_contexts', None)
+            if allowed_contexts is None:
+                allowed_contexts = skypilot_config.get_effective_region_config(
                     cloud='kubernetes',
                     region=None,
                     keys=('allowed_contexts',),
-                    default_value=None) is None:
+                    default_value=None)
+            if allowed_contexts is None:
                 excluded_clouds.add(cloud)
         else:
             excluded_clouds.add(cloud)
@@ -1226,7 +1230,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     Rather than constructing the whole byte sequence, which may be quite large,
     we construct it incrementally by using hash.update() to add new bytes.
     """
-
     # Load the yaml contents so that we can directly remove keys.
     yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -2614,7 +2617,7 @@ def refresh_cluster_record(
         cluster_name: str,
         *,
         force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
-        ...
+        cluster_lock_already_held: bool = False,
         cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
         include_user_info: bool = True,
         summary_response: bool = False) -> Optional[Dict[str, Any]]:
@@ -2634,9 +2637,13 @@ def refresh_cluster_record(
             _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
             1. the cluster is a spot cluster, or
             2. cluster autostop is set and the cluster is not STOPPED.
-        ...
+        cluster_lock_already_held: Whether the caller is already holding the
+            per-cluster lock. You MUST NOT set this to True if the caller does not
+            already hold the lock. If True, we will not acquire the lock before
+            updating the status. Failing to hold the lock while updating the
+            status can lead to correctness issues - e.g. an launch in-progress may
+            appear to be DOWN incorrectly. Even if this is set to False, the lock
+            may not be acquired if the status does not need to be refreshed.
         cluster_status_lock_timeout: The timeout to acquire the per-cluster
             lock. If timeout, the function will use the cached status. If the
             value is <0, do not timeout (wait for the lock indefinitely). By
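As an aside, a minimal sketch of the calling contract this docstring describes: pass cluster_lock_already_held=True only from a code path that already owns the per-cluster lock, and leave the default False otherwise. The lock helper below is a hypothetical stand-in for illustration, not an API from this release.

import contextlib

from sky.backends import backend_utils


@contextlib.contextmanager
def hold_cluster_lock(cluster_name: str):
    """Hypothetical placeholder for the caller's per-cluster lock."""
    # In real code this would be the same per-cluster lock that
    # refresh_cluster_record would otherwise acquire itself.
    yield


def refresh_record_holding_lock(cluster_name: str):
    # Caller already holds the lock: tell refresh_cluster_record not to
    # re-acquire it before updating the cached status.
    with hold_cluster_lock(cluster_name):
        return backend_utils.refresh_cluster_record(
            cluster_name, cluster_lock_already_held=True)


def refresh_record_without_lock(cluster_name: str):
    # Default (False): the function acquires the lock itself if the status
    # actually needs to be refreshed.
    return backend_utils.refresh_cluster_record(cluster_name)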
@@ -2687,7 +2694,7 @@ def refresh_cluster_record(
     if not _must_refresh_cluster_status(record, force_refresh_statuses):
         return record
 
-    if
+    if cluster_lock_already_held:
         return _update_cluster_status(cluster_name, include_user_info,
                                       summary_response)
 
@@ -2741,7 +2748,7 @@ def refresh_cluster_status_handle(
         cluster_name: str,
         *,
         force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
-        ...
+        cluster_lock_already_held: bool = False,
         cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
 ) -> Tuple[Optional[status_lib.ClusterStatus],
            Optional[backends.ResourceHandle]]:
@@ -2754,7 +2761,7 @@ def refresh_cluster_status_handle(
     record = refresh_cluster_record(
         cluster_name,
         force_refresh_statuses=force_refresh_statuses,
-        ...
+        cluster_lock_already_held=cluster_lock_already_held,
         cluster_status_lock_timeout=cluster_status_lock_timeout,
         include_user_info=False,
         summary_response=True)
@@ -3079,7 +3086,7 @@ def _refresh_cluster(
         record = refresh_cluster_record(
             cluster_name,
             force_refresh_statuses=force_refresh_statuses,
-            ...
+            cluster_lock_already_held=False,
             include_user_info=include_user_info,
             summary_response=summary_response)
     except (exceptions.ClusterStatusFetchingError,
@@ -3859,13 +3866,35 @@ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     ) from last_exception
 
 
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
 def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
     if e.code() == grpc.StatusCode.INTERNAL:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.SkyletInternalError(e.details())
     elif e.code() == grpc.StatusCode.UNAVAILABLE:
         time.sleep(current_backoff)
-    elif e.code() == grpc.StatusCode.UNIMPLEMENTED
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
         # Handle backwards compatibility: old server doesn't implement this RPC.
         # Let the caller fall back to legacy execution.
         raise exceptions.SkyletMethodNotImplementedError(
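A usage sketch for the new streaming helper (not an excerpt from this release): it wraps a zero-argument callable that opens a Skylet stream and re-invokes it on transient gRPC errors. The TailLogs consumption below mirrors what cloud_vm_ray_backend.py does later in this diff; the handle plumbing is assumed.

from sky.backends import backend_utils
from sky.backends.cloud_vm_ray_backend import SkyletClient
from sky.schemas.generated import jobsv1_pb2


def tail_job_logs_via_grpc(handle, job_id: int) -> int:
    """Sketch: stream a job's logs over the Skylet TailLogs RPC."""
    request = jobsv1_pb2.TailLogsRequest(job_id=job_id, follow=True)
    exit_code = 0
    # The helper re-opens the stream (by calling the lambda again) on
    # transient gRPC errors, up to 3 attempts, and surfaces UNIMPLEMENTED or
    # UNKNOWN as SkyletMethodNotImplementedError so callers can fall back to
    # the legacy path.
    for resp in backend_utils.invoke_skylet_streaming_with_retries(
            lambda: SkyletClient(handle.get_grpc_channel()).tail_logs(
                request, timeout=None)):
        if resp.log_line:
            print(resp.log_line, end='', flush=True)
        exit_code = resp.exit_code
    return exit_code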
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -19,8 +19,8 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)
 
 import colorama
 import psutil
@@ -91,6 +91,8 @@ if typing.TYPE_CHECKING:
     from sky.schemas.generated import autostopv1_pb2_grpc
     from sky.schemas.generated import jobsv1_pb2
     from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
     from sky.schemas.generated import servev1_pb2
     from sky.schemas.generated import servev1_pb2_grpc
 else:
@@ -111,6 +113,10 @@ else:
         'sky.schemas.generated.servev1_pb2')
     servev1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')
 
 Path = str
 
@@ -2737,6 +2743,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             (tunnel.port, tunnel.pid) if tunnel is not None else None)
 
     def get_grpc_channel(self) -> 'grpc.Channel':
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
         # It's fine to not grab the lock here, as we're only reading,
         # and writes are very rare.
         # It's acceptable to read while another process is opening a tunnel,
@@ -2753,7 +2764,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 s.settimeout(0.5)
                 s.connect(('localhost', tunnel.port))
-            return grpc.insecure_channel(f'localhost:{tunnel.port}'
+            return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                         options=grpc_options)
         except socket.error as e:
             logger.warning(
                 'Failed to connect to SSH tunnel for cluster '
@@ -2772,19 +2784,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                     f'{self.cluster_name!r}, '
                     'opening the tunnel')
                 tunnel = self._open_and_update_skylet_tunnel()
-                return grpc.insecure_channel(f'localhost:{tunnel.port}'
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
             try:
                 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                     s.settimeout(0.5)
                     s.connect(('localhost', tunnel.port))
-                return grpc.insecure_channel(f'localhost:{tunnel.port}'
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
             except socket.error as e:
                 logger.warning(
                     'Failed to connect to SSH tunnel for cluster '
                     f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                     'opening new tunnel')
                 tunnel = self._open_and_update_skylet_tunnel()
-                return grpc.insecure_channel(f'localhost:{tunnel.port}'
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
         except locks.LockTimeout as e:
             raise RuntimeError(
                 'Failed to get gRPC channel for cluster '
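The functional change in get_grpc_channel is that every insecure channel is now created with grpc.max_receive_message_length set to -1 (unlimited), since task YAMLs shipped over Skylet RPCs can exceed gRPC's default 4 MB inbound cap. A standalone illustration of that channel option, using a placeholder address instead of the SSH-tunnel port the real code dials:

import grpc

# -1 lifts gRPC's default 4 MB limit on inbound message size for this channel.
channel = grpc.insecure_channel(
    'localhost:50051',  # placeholder address for illustration only
    options=[('grpc.max_receive_message_length', -1)])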
@@ -3060,6 +3075,8 @@ class SkyletClient:
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
         self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
         self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
 
     def set_autostop(
         self,
@@ -3146,6 +3163,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
 
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
     def get_service_status(
         self,
         request: 'servev1_pb2.GetServiceStatusRequest',
@@ -3194,6 +3218,35 @@ class SkyletClient:
     ) -> 'servev1_pb2.UpdateServiceResponse':
         return self._serve_stub.UpdateService(request, timeout=timeout)
 
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                          timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
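A sketch of driving one of the new managed-jobs stubs through the existing retry helper. Only the job_name field is taken from this diff (it appears in the GetAllJobIdsByNameRequest usage further down); the handle argument is assumed to be a CloudVmRayResourceHandle with gRPC enabled.

from sky.backends import backend_utils
from sky.backends.cloud_vm_ray_backend import SkyletClient
from sky.schemas.generated import managed_jobsv1_pb2


def managed_job_ids_by_name(handle, job_name: str):
    """Sketch: resolve managed job IDs by name over the new Skylet RPC."""
    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(job_name=job_name)
    # invoke_skylet_with_retries raises SkyletMethodNotImplementedError when
    # the remote Skylet predates the ManagedJobs service, letting callers fall
    # back to the legacy ManagedJobCodeGen path.
    response = backend_utils.invoke_skylet_with_retries(
        lambda: SkyletClient(handle.get_grpc_channel())
        .get_all_managed_job_ids_by_name(request))
    return list(response.job_ids)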
@@ -3706,7 +3759,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 ux_utils.spinner_message('Preparing SkyPilot runtime')):
             use_legacy = not handle.is_grpc_enabled_with_flag
 
-            if
+            if not use_legacy:
                 try:
                     request = jobsv1_pb2.UpdateStatusRequest()
                     backend_utils.invoke_skylet_with_retries(
@@ -3730,7 +3783,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # 2. On next `sky start`, it gets reset to FAILED.
             use_legacy = not handle.is_grpc_enabled_with_flag
 
-            if
+            if not use_legacy:
                 try:
                     fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
                     backend_utils.invoke_skylet_with_retries(
@@ -4165,7 +4218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if
+        if not use_legacy:
             try:
                 managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
                 if managed_job_dag is not None:
@@ -4297,7 +4350,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                  metadata: str) -> Tuple[int, str]:
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
             try:
                 request = jobsv1_pb2.AddJobRequest(
                     job_name=job_name,
@@ -4567,7 +4620,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         """
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
             try:
                 request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
                                                        cancel_all=cancel_all,
@@ -4610,7 +4663,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_to_dir: Dict[str, str] = {}
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if
+        if not use_legacy:
             try:
                 int_job_ids = []
                 if job_ids:
@@ -4724,6 +4777,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             The exit code of the tail command. Returns code 100 if the job has
             failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
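The hunks in this part of the file all follow the same gRPC-first shape: take the Skylet RPC path when handle.is_grpc_enabled_with_flag is set, and fall back to the legacy CodeGen-over-SSH path when the remote Skylet is too old (surfaced as SkyletMethodNotImplementedError). A condensed sketch of that pattern, with the two call sites reduced to placeholder callables:

from typing import Any, Callable

from sky import exceptions
from sky.backends import backend_utils


def call_with_grpc_fallback(handle, grpc_call: Callable[[], Any],
                            legacy_call: Callable[[], Any]) -> Any:
    """Sketch of the gRPC-then-legacy pattern used across this file.

    grpc_call issues the Skylet RPC; legacy_call runs the old code-gen flow
    via run_on_head. Both are placeholders for this illustration.
    """
    use_legacy = not handle.is_grpc_enabled_with_flag
    if not use_legacy:
        try:
            return backend_utils.invoke_skylet_with_retries(grpc_call)
        except exceptions.SkyletMethodNotImplementedError:
            # The cluster is running an older Skylet without this RPC.
            use_legacy = True
    return legacy_call()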
@@ -4761,6 +4836,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4806,20 +4882,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            #
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-            ...
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -4847,18 +4940,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-            ...
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -4925,6 +5039,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                 exist_ok=True)
         log_file = os.path.join(local_log_dir, 'run.log')
 
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name=None,
             job_id=int(job_id),
@@ -5006,7 +5121,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # observed in AWS. See also
                     # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
                     force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                    ...
+                    cluster_lock_already_held=True))
                 cluster_status_fetched = True
             except exceptions.ClusterStatusFetchingError:
                 logger.warning(
@@ -5610,7 +5725,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             record = backend_utils.refresh_cluster_record(
                 cluster_name,
                 force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                ...
+                cluster_lock_already_held=True,
                 include_user_info=False,
                 summary_response=True,
             )
sky/check.py
CHANGED
@@ -621,35 +621,6 @@ def _format_enabled_cloud(cloud_name: str,
     if cloud_name in [repr(sky_clouds.Kubernetes()), repr(sky_clouds.SSH())]:
         return (f'{title}' + _format_context_details(
             cloud_name, show_details=False, ctx2text=ctx2text))
-
-    if cloud_name == repr(sky_clouds.Kubernetes()):
-        # Get enabled contexts for Kubernetes
-        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
-        if not existing_contexts:
-            return _green_color(cloud_and_capabilities)
-
-        # Check if allowed_contexts is explicitly set in config
-        allowed_contexts = skypilot_config.get_effective_region_config(
-            cloud='kubernetes',
-            region=None,
-            keys=('allowed_contexts',),
-            default_value=None)
-
-        # Format the context info with consistent styling
-        if allowed_contexts is not None:
-            contexts_formatted = []
-            for i, context in enumerate(existing_contexts):
-                symbol = (ux_utils.INDENT_LAST_SYMBOL
-                          if i == len(existing_contexts) -
-                          1 else ux_utils.INDENT_SYMBOL)
-                contexts_formatted.append(f'\n {symbol}{context}')
-            context_info = f' Allowed contexts:{"".join(contexts_formatted)}'
-        else:
-            context_info = f' Active context: {existing_contexts[0]}'
-
-        return (f'{_green_color(cloud_and_capabilities)}\n'
-                f' {colorama.Style.DIM}{context_info}'
-                f'{colorama.Style.RESET_ALL}')
     return _green_color(cloud_and_capabilities)
 
 