skypilot-nightly 1.0.0.dev20250912__py3-none-any.whl → 1.0.0.dev20250914__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/seeweb.py +103 -0
- sky/authentication.py +38 -0
- sky/backends/backend_utils.py +24 -9
- sky/backends/cloud_vm_ray_backend.py +382 -151
- sky/catalog/data_fetchers/fetch_aws.py +0 -36
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/seeweb_catalog.py +184 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/seeweb.py +463 -0
- sky/core.py +46 -12
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
- sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
- sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
- sky/dashboard/out/_next/static/chunks/{webpack-e8a0c4c3c6f408fb.js → webpack-e2e3d2d3de7d43e5.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +5 -0
- sky/global_user_state.py +41 -26
- sky/jobs/utils.py +61 -13
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/utils.py +14 -3
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +806 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +252 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/setup_files/dependencies.py +8 -1
- sky/skylet/constants.py +2 -1
- sky/skylet/job_lib.py +128 -10
- sky/skylet/log_lib.py +3 -3
- sky/skylet/services.py +203 -0
- sky/skylet/skylet.py +4 -0
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/utils/cluster_utils.py +6 -2
- sky/utils/controller_utils.py +11 -5
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/METADATA +39 -34
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/RECORD +65 -54
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- /sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/top_level.txt +0 -0
@@ -88,6 +88,8 @@ if typing.TYPE_CHECKING:
     from sky import dag
     from sky.schemas.generated import autostopv1_pb2
     from sky.schemas.generated import autostopv1_pb2_grpc
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import jobsv1_pb2_grpc
 else:
     # To avoid requiring grpcio to be installed on the client side.
     grpc = adaptors_common.LazyImport(
@@ -99,6 +101,9 @@ else:
         'sky.schemas.generated.autostopv1_pb2')
     autostopv1_pb2_grpc = adaptors_common.LazyImport(
         'sky.schemas.generated.autostopv1_pb2_grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.jobsv1_pb2_grpc')
 
 Path = str
 
@@ -225,7 +230,8 @@ def _get_cluster_config_template(cloud):
         clouds.Vast: 'vast-ray.yml.j2',
         clouds.Fluidstack: 'fluidstack-ray.yml.j2',
         clouds.Nebius: 'nebius-ray.yml.j2',
-        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+        clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+        clouds.Seeweb: 'seeweb-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]
 
@@ -3038,21 +3044,93 @@ class SkyletClient:
 
     def __init__(self, channel: 'grpc.Channel'):
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+        self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
 
     def set_autostop(
         self,
         request: 'autostopv1_pb2.SetAutostopRequest',
-        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
     ) -> 'autostopv1_pb2.SetAutostopResponse':
         return self._autostop_stub.SetAutostop(request, timeout=timeout)
 
     def is_autostopping(
         self,
         request: 'autostopv1_pb2.IsAutostoppingRequest',
-        timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
     ) -> 'autostopv1_pb2.IsAutostoppingResponse':
         return self._autostop_stub.IsAutostopping(request, timeout=timeout)
 
+    def add_job(
+        self,
+        request: 'jobsv1_pb2.AddJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.AddJobResponse':
+        return self._jobs_stub.AddJob(request, timeout=timeout)
+
+    def queue_job(
+        self,
+        request: 'jobsv1_pb2.QueueJobRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.QueueJobResponse':
+        return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+    def update_status(
+        self,
+        request: 'jobsv1_pb2.UpdateStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.UpdateStatusResponse':
+        return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+    def get_job_queue(
+        self,
+        request: 'jobsv1_pb2.GetJobQueueRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobQueueResponse':
+        return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+    def cancel_jobs(
+        self,
+        request: 'jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.CancelJobsResponse':
+        return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+    def fail_all_in_progress_jobs(
+        self,
+        request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+        return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+    def get_job_status(
+        self,
+        request: 'jobsv1_pb2.GetJobStatusRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobStatusResponse':
+        return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+    def get_job_submitted_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+        return self._jobs_stub.GetJobSubmittedTimestamp(request,
+                                                        timeout=timeout)
+
+    def get_job_ended_timestamp(
+        self,
+        request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+        return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+    def get_log_dirs_for_jobs(
+        self,
+        request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+        return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
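The hunks below do not call these new JobsService stubs directly; they go through backend_utils.invoke_skylet_with_retries. A condensed sketch of that usage, mirroring the _add_job call site later in this diff (handle, self.run_timestamp and the other local variables are assumed to come from the surrounding backend code; this is internal API, not a public interface):

    # Sketch only, based on the call sites in the hunks below.
    request = jobsv1_pb2.AddJobRequest(
        job_name=job_name,
        username=common_utils.get_user_hash(),
        run_timestamp=self.run_timestamp,
        resources_str=resources_str,
        metadata=metadata)
    response = backend_utils.invoke_skylet_with_retries(
        lambda: SkyletClient(handle.get_grpc_channel()).add_job(request))
    job_id, log_dir = response.job_id, response.log_dir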
@@ -3559,16 +3637,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # update_status will query the ray job status for all INIT /
             # PENDING / RUNNING jobs for the real status, since we do not
             # know the actual previous status of the cluster.
-            cmd = job_lib.JobLibCodeGen.update_status()
             logger.debug('Update job queue on remote cluster.')
             with rich_utils.safe_status(
                     ux_utils.spinner_message('Preparing SkyPilot runtime')):
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if handle.is_grpc_enabled_with_flag:
+                    try:
+                        request = jobsv1_pb2.UpdateStatusRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel()
+                                                ).update_status(request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.update_status()
+                    returncode, _, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd, 'Failed to update job status.', stderr)
             if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
                 # Safely set all the previous jobs to FAILED since the cluster
                 # is restarted
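The same try-gRPC-then-fall-back structure recurs in every job-related hunk below: issue the Skylet RPC first, and drop back to the legacy JobLibCodeGen-over-SSH path when the remote runtime raises SkyletMethodNotImplementedError. Distilled into a sketch (the request type and legacy command vary per operation; names are taken from the hunks in this diff):

    use_legacy = not handle.is_grpc_enabled_with_flag
    if handle.is_grpc_enabled_with_flag:
        try:
            # Preferred path: issue the operation over the Skylet gRPC channel.
            backend_utils.invoke_skylet_with_retries(
                lambda: SkyletClient(handle.get_grpc_channel()).update_status(
                    jobsv1_pb2.UpdateStatusRequest()))
        except exceptions.SkyletMethodNotImplementedError:
            # Remote skylet predates the JobsService; use codegen over SSH.
            use_legacy = True
    if use_legacy:
        cmd = job_lib.JobLibCodeGen.update_status()
        returncode, _, stderr = self.run_on_head(handle, cmd,
                                                 require_outputs=True)
        subprocess_utils.handle_returncode(returncode, cmd,
                                           'Failed to update job status.',
                                           stderr)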
@@ -3576,14 +3664,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # 1. A job finishes RUNNING, but right before it update itself
                 # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
                 # 2. On next `sky start`, it gets reset to FAILED.
-
-
-
-
-
-
-
-
+                use_legacy = not handle.is_grpc_enabled_with_flag
+
+                if handle.is_grpc_enabled_with_flag:
+                    try:
+                        fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).fail_all_in_progress_jobs(fail_request))
+                    except exceptions.SkyletMethodNotImplementedError:
+                        use_legacy = True
+
+                if use_legacy:
+                    cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+                    returncode, stdout, stderr = self.run_on_head(
+                        handle, cmd, require_outputs=True)
+                    subprocess_utils.handle_returncode(
+                        returncode, cmd,
+                        'Failed to set previously in-progress jobs to FAILED',
+                        stdout + stderr)
 
             prev_ports = None
             if prev_handle is not None:
@@ -3942,109 +4041,161 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         remote_log_dir: Optional[str] = None,
     ) -> None:
         """Executes generated code on the head node."""
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+        file_name = f'sky_job_{job_id}'
+        script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
         if remote_log_dir is None:
             remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
 
-
+        def _dump_code_to_file(codegen: str,
+                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+            runners = handle.get_command_runners()
+            head_runner = runners[0]
+            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+                fp.write(codegen)
+                fp.flush()
+                script_path = os.path.join(target_dir, file_name)
+                # We choose to sync code + exec, because the alternative of
+                # 'ray submit' may not work as it may use system python
+                # (python2) to execute the script. Happens for AWS.
+                head_runner.rsync(source=fp.name,
+                                  target=script_path,
+                                  up=True,
+                                  stream_logs=False)
 
+        cd = f'cd {SKY_REMOTE_WORKDIR}'
         mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
                       f'touch {remote_log_path}')
         encoded_script = shlex.quote(codegen)
         create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
         job_submit_cmd = (
-            # JOB_CMD_IDENTIFIER is used for identifying the process
-            # with pid is the same driver process.
+            # JOB_CMD_IDENTIFIER is used for identifying the process
+            # retrieved with pid is the same driver process.
             f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
             f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
             # Do not use &>, which is not POSIX and may not work.
             # Note that the order of ">filename 2>&1" matters.
             f'> {remote_log_path} 2>&1')
-
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
 
-        def _dump_code_to_file(codegen: str,
-                               target_dir: str = SKY_REMOTE_APP_DIR) -> None:
-            runners = handle.get_command_runners()
-            head_runner = runners[0]
-            with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
-                fp.write(codegen)
-                fp.flush()
-                script_path = os.path.join(target_dir, f'sky_job_{job_id}')
-                # We choose to sync code + exec, because the alternative of 'ray
-                # submit' may not work as it may use system python (python2) to
-                # execute the script. Happens for AWS.
-                head_runner.rsync(source=fp.name,
-                                  target=script_path,
-                                  up=True,
-                                  stream_logs=False)
-
         # Should also be ealier than _is_command_length_over_limit
         # Same reason as in _setup
         if self._dump_final_script:
             _dump_code_to_file(job_submit_cmd,
                                constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+                if managed_job_dag is not None:
+                    workspace = skypilot_config.get_active_workspace(
+                        force_user_workspace=True)
+                    entrypoint = common_utils.get_current_command()
+
+                    managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+                    for task_id, task in enumerate(managed_job_dag.tasks):
+                        resources_str = backend_utils.get_task_resources_str(
+                            task, is_managed_job=True)
+                        managed_job_tasks.append(
+                            jobsv1_pb2.ManagedJobTask(
+                                task_id=task_id,
+                                name=task.name,
+                                resources_str=resources_str,
+                                metadata_json=task.metadata_json))
+
+                    managed_job_info = jobsv1_pb2.ManagedJobInfo(
+                        name=managed_job_dag.name,
+                        pool=managed_job_dag.pool,
+                        workspace=workspace,
+                        entrypoint=entrypoint,
+                        tasks=managed_job_tasks)
+
+                if _is_command_length_over_limit(codegen):
+                    _dump_code_to_file(codegen)
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        # codegen not set - server assumes script uploaded
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+                else:
+                    queue_job_request = jobsv1_pb2.QueueJobRequest(
+                        job_id=job_id,
+                        codegen=codegen,
+                        remote_log_dir=remote_log_dir,
+                        managed_job=managed_job_info,
+                        script_path=script_path)
+
+                backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                    handle.get_grpc_channel()).queue_job(queue_job_request))
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if _is_command_length_over_limit(job_submit_cmd):
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+
+            def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+                if managed_job_dag is not None:
+                    # Add the managed job to job queue database.
+                    managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+                    managed_job_code = managed_job_codegen.set_pending(
+                        job_id,
+                        managed_job_dag,
+                        skypilot_config.get_active_workspace(
+                            force_user_workspace=True),
+                        entrypoint=common_utils.get_current_command())
+                    # Set the managed job to PENDING state to make sure that
+                    # this managed job appears in the `sky jobs queue`, even
+                    # if it needs to wait to be submitted.
+                    # We cannot set the managed job to PENDING state in the
+                    # job template (jobs-controller.yaml.j2), as it may need
+                    # to wait for the run commands to be scheduled on the job
+                    # controller in high-load cases.
+                    job_submit_cmd += ' && ' + managed_job_code
+                return job_submit_cmd
 
-        returncode, stdout, stderr = self.run_on_head(handle,
-                                                      job_submit_cmd,
-                                                      stream_logs=False,
-                                                      require_outputs=True)
-        # Happens when someone calls `sky exec` but remote is outdated for
-        # running a job. Necessitating calling `sky launch`.
-        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
-                                                    handle.cluster_name)
-        output = stdout + stderr
-        if ((returncode == 255 and 'too long' in output.lower()) or
-                (returncode == 1 and 'request-uri too large' in output.lower())):
-            # If the generated script is too long, we retry it with dumping
-            # the script to a file and running it with SSH. We use a general
-            # length limit check before but it could be inaccurate on some
-            # systems.
-            # When there is a cloudflare proxy in front of the remote, it could
-            # cause `414 Request-URI Too Large` error.
-            logger.debug('Failed to submit job due to command length limit. '
-                         'Dumping job to file and running it with SSH. '
-                         f'Output: {output}')
-            _dump_code_to_file(codegen)
-            job_submit_cmd = f'{mkdir_code} && {code}'
             job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+
             returncode, stdout, stderr = self.run_on_head(handle,
                                                           job_submit_cmd,
                                                           stream_logs=False,
                                                           require_outputs=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # running a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            output = stdout + stderr
+            if ((returncode == 255 and 'too long' in output.lower()) or
+                    (returncode == 1 and
+                     'request-uri too large' in output.lower())):
+                # If the generated script is too long, we retry it with dumping
+                # the script to a file and running it with SSH. We use a general
+                # length limit check before but it could be inaccurate on some
+                # systems.
+                # When there is a cloudflare proxy in front of the remote, it
+                # could cause `414 Request-URI Too Large` error.
+                logger.debug(
+                    'Failed to submit job due to command length limit. '
+                    'Dumping job to file and running it with SSH. '
+                    f'Output: {output}')
+                _dump_code_to_file(codegen)
+                job_submit_cmd = f'{mkdir_code} && {code}'
+                job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+                returncode, stdout, stderr = self.run_on_head(
+                    handle,
+                    job_submit_cmd,
+                    stream_logs=False,
+                    require_outputs=True)
 
-
-
-
-
+            subprocess_utils.handle_returncode(
+                returncode,
+                job_submit_cmd,
+                f'Failed to submit job {job_id}.',
+                stderr=stdout + stderr)
 
         controller = controller_utils.Controllers.from_name(handle.cluster_name)
         if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -4065,42 +4216,64 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _add_job(self, handle: CloudVmRayResourceHandle,
                  job_name: Optional[str], resources_str: str,
                  metadata: str) -> Tuple[int, str]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.AddJobRequest(
+                    job_name=job_name,
+                    username=common_utils.get_user_hash(),
+                    run_timestamp=self.run_timestamp,
+                    resources_str=resources_str,
+                    metadata=metadata)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+                        request))
+                job_id = response.job_id
+                log_dir = response.log_dir
+                return job_id, log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.add_job(
+                job_name=job_name,
+                username=common_utils.get_user_hash(),
+                run_timestamp=self.run_timestamp,
+                resources_str=resources_str,
+                metadata=metadata)
+            returncode, result_str, stderr = self.run_on_head(
+                handle,
+                code,
+                stream_logs=False,
+                require_outputs=True,
+                separate_stderr=True)
+            # Happens when someone calls `sky exec` but remote is outdated for
+            # adding a job. Necessitating calling `sky launch`.
+            backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                        handle.cluster_name)
+            # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+            # retry for this, after we figure out the reason.
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to fetch job id.',
+                                               stderr)
+            try:
+                job_id_match = _JOB_ID_PATTERN.search(result_str)
+                if job_id_match is not None:
+                    job_id = int(job_id_match.group(1))
+                else:
+                    # For backward compatibility.
+                    job_id = int(result_str)
+                log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+                if log_dir_match is not None:
+                    log_dir = log_dir_match.group(1).strip()
+                else:
+                    # For backward compatibility, use the same log dir as local.
+                    log_dir = self.log_dir
+            except ValueError as e:
+                logger.error(stderr)
+                raise ValueError(f'Failed to parse job id: {result_str}; '
+                                 f'Returncode: {returncode}') from e
         return job_id, log_dir
 
     def _execute(
@@ -4279,6 +4452,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             job_ids: Optional[List[int]] = None,
             stream_logs: bool = True
     ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_job_status(request))
+                statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+                    job_id: job_lib.JobStatus.from_protobuf(proto_status)
+                    for job_id, proto_status in response.job_statuses.items()
+                }
+                return statuses
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+
         code = job_lib.JobLibCodeGen.get_job_status(job_ids)
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       code,
@@ -4299,16 +4486,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-
-
-
-
-
-
-
-
-
-
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+                                                       cancel_all=cancel_all,
+                                                       user_hash=user_hash)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+                        request))
+                cancelled_ids = response.cancelled_job_ids
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+                                                     user_hash)
+            returncode, stdout, _ = self.run_on_head(handle,
+                                                     code,
+                                                     stream_logs=False,
+                                                     require_outputs=True)
+            subprocess_utils.handle_returncode(
+                returncode, code,
+                f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+                stdout)
+            cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4325,20 +4528,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-
-
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if handle.is_grpc_enabled_with_flag:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
                                                           separate_stderr=True)
-
-
-
-
-
-
-
-
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
 
         job_ids = list(job_to_dir.keys())
         dirs = list(job_to_dir.values())
@@ -4615,11 +4846,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                     exist_ok=True)
            log_file = os.path.join(local_log_dir, 'run.log')
 
-           code = managed_jobs.ManagedJobCodeGen.stream_logs(
-
-
-
-
+           code = managed_jobs.ManagedJobCodeGen.stream_logs(
+               job_name=None,
+               job_id=int(job_id),
+               follow=False,
+               controller=False)
            # With the stdin=subprocess.DEVNULL, the ctrl-c will not
            # kill the process, so we need to handle it manually here.
            if threading.current_thread() is threading.main_thread():