skypilot-nightly 1.0.0.dev20250912__py3-none-any.whl → 1.0.0.dev20250914__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (73)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/seeweb.py +103 -0
  3. sky/authentication.py +38 -0
  4. sky/backends/backend_utils.py +24 -9
  5. sky/backends/cloud_vm_ray_backend.py +382 -151
  6. sky/catalog/data_fetchers/fetch_aws.py +0 -36
  7. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  8. sky/catalog/seeweb_catalog.py +184 -0
  9. sky/clouds/__init__.py +2 -0
  10. sky/clouds/kubernetes.py +2 -0
  11. sky/clouds/seeweb.py +463 -0
  12. sky/core.py +46 -12
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_buildManifest.js +1 -1
  15. sky/dashboard/out/_next/static/chunks/1141-159df2d4c441a9d1.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/3294.03e02ae73455f48e.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/3785.0fa442e16dd3f00e.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/5339.c033b29835da0f35.js +51 -0
  20. sky/dashboard/out/_next/static/chunks/6856-e0754534b3015377.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/6990-11c8e9b982e8ffec.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/{webpack-e8a0c4c3c6f408fb.js → webpack-e2e3d2d3de7d43e5.js} +1 -1
  24. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  25. sky/dashboard/out/clusters/[cluster].html +1 -1
  26. sky/dashboard/out/clusters.html +1 -1
  27. sky/dashboard/out/config.html +1 -1
  28. sky/dashboard/out/index.html +1 -1
  29. sky/dashboard/out/infra/[context].html +1 -1
  30. sky/dashboard/out/infra.html +1 -1
  31. sky/dashboard/out/jobs/[job].html +1 -1
  32. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  33. sky/dashboard/out/jobs.html +1 -1
  34. sky/dashboard/out/users.html +1 -1
  35. sky/dashboard/out/volumes.html +1 -1
  36. sky/dashboard/out/workspace/new.html +1 -1
  37. sky/dashboard/out/workspaces/[name].html +1 -1
  38. sky/dashboard/out/workspaces.html +1 -1
  39. sky/exceptions.py +5 -0
  40. sky/global_user_state.py +41 -26
  41. sky/jobs/utils.py +61 -13
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/kubernetes/utils.py +14 -3
  44. sky/provision/seeweb/__init__.py +11 -0
  45. sky/provision/seeweb/config.py +13 -0
  46. sky/provision/seeweb/instance.py +806 -0
  47. sky/schemas/generated/jobsv1_pb2.py +86 -0
  48. sky/schemas/generated/jobsv1_pb2.pyi +252 -0
  49. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  50. sky/setup_files/dependencies.py +8 -1
  51. sky/skylet/constants.py +2 -1
  52. sky/skylet/job_lib.py +128 -10
  53. sky/skylet/log_lib.py +3 -3
  54. sky/skylet/services.py +203 -0
  55. sky/skylet/skylet.py +4 -0
  56. sky/templates/seeweb-ray.yml.j2 +108 -0
  57. sky/utils/cluster_utils.py +6 -2
  58. sky/utils/controller_utils.py +11 -5
  59. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/METADATA +39 -34
  60. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/RECORD +65 -54
  61. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  62. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  63. sky/dashboard/out/_next/static/chunks/3294.ba6586f9755b0edb.js +0 -6
  64. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  65. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  66. sky/dashboard/out/_next/static/chunks/6856-6e2bc8a6fd0867af.js +0 -1
  67. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  68. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  69. /sky/dashboard/out/_next/static/{DAiq7V2xJnO1LSfmunZl6 → 5iak5kYp9a9ezANCb74L8}/_ssgManifest.js +0 -0
  70. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/WHEEL +0 -0
  71. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250912.dist-info → skypilot_nightly-1.0.0.dev20250914.dist-info}/top_level.txt +0 -0
@@ -88,6 +88,8 @@ if typing.TYPE_CHECKING:
  from sky import dag
  from sky.schemas.generated import autostopv1_pb2
  from sky.schemas.generated import autostopv1_pb2_grpc
+ from sky.schemas.generated import jobsv1_pb2
+ from sky.schemas.generated import jobsv1_pb2_grpc
  else:
  # To avoid requiring grpcio to be installed on the client side.
  grpc = adaptors_common.LazyImport(
@@ -99,6 +101,9 @@ else:
  'sky.schemas.generated.autostopv1_pb2')
  autostopv1_pb2_grpc = adaptors_common.LazyImport(
  'sky.schemas.generated.autostopv1_pb2_grpc')
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+ jobsv1_pb2_grpc = adaptors_common.LazyImport(
+ 'sky.schemas.generated.jobsv1_pb2_grpc')

  Path = str

@@ -225,7 +230,8 @@ def _get_cluster_config_template(cloud):
  clouds.Vast: 'vast-ray.yml.j2',
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
  clouds.Nebius: 'nebius-ray.yml.j2',
- clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
+ clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
+ clouds.Seeweb: 'seeweb-ray.yml.j2'
  }
  return cloud_to_template[type(cloud)]

@@ -3038,21 +3044,93 @@ class SkyletClient:

  def __init__(self, channel: 'grpc.Channel'):
  self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)

  def set_autostop(
  self,
  request: 'autostopv1_pb2.SetAutostopRequest',
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
  ) -> 'autostopv1_pb2.SetAutostopResponse':
  return self._autostop_stub.SetAutostop(request, timeout=timeout)

  def is_autostopping(
  self,
  request: 'autostopv1_pb2.IsAutostoppingRequest',
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
  ) -> 'autostopv1_pb2.IsAutostoppingResponse':
  return self._autostop_stub.IsAutostopping(request, timeout=timeout)

+ def add_job(
+ self,
+ request: 'jobsv1_pb2.AddJobRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.AddJobResponse':
+ return self._jobs_stub.AddJob(request, timeout=timeout)
+
+ def queue_job(
+ self,
+ request: 'jobsv1_pb2.QueueJobRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.QueueJobResponse':
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
+
+ def update_status(
+ self,
+ request: 'jobsv1_pb2.UpdateStatusRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
+
+ def get_job_queue(
+ self,
+ request: 'jobsv1_pb2.GetJobQueueRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
+
+ def cancel_jobs(
+ self,
+ request: 'jobsv1_pb2.CancelJobsRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
+
+ def fail_all_in_progress_jobs(
+ self,
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
+
+ def get_job_status(
+ self,
+ request: 'jobsv1_pb2.GetJobStatusRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
+
+ def get_job_submitted_timestamp(
+ self,
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
+ timeout=timeout)
+
+ def get_job_ended_timestamp(
+ self,
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
+
+ def get_log_dirs_for_jobs(
+ self,
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
+

  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
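
These hunks appear to come from sky/backends/cloud_vm_ray_backend.py (+382 -151 in the file list above): SkyletClient gains one thin wrapper per new JobsService RPC, mirroring the existing autostop wrappers. As a rough usage sketch, based only on the calls visible in this diff (invoke_skylet_with_retries, handle.get_grpc_channel(), and the jobsv1_pb2 request/response fields) rather than on a verified API, adding a job from backend code looks roughly like:

    # Hedged sketch inside a CloudVmRayBackend method; `handle` is the
    # cluster's CloudVmRayResourceHandle, and the field names mirror
    # jobsv1_pb2.AddJobRequest as shown in the _add_job hunk further below.
    request = jobsv1_pb2.AddJobRequest(
        job_name=job_name,
        username=common_utils.get_user_hash(),
        run_timestamp=self.run_timestamp,
        resources_str=resources_str,
        metadata=metadata)
    # invoke_skylet_with_retries wraps the call so transient channel errors
    # are retried before giving up.
    response = backend_utils.invoke_skylet_with_retries(
        lambda: SkyletClient(handle.get_grpc_channel()).add_job(request))
    job_id, log_dir = response.job_id, response.log_dir
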
@@ -3559,16 +3637,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # update_status will query the ray job status for all INIT /
  # PENDING / RUNNING jobs for the real status, since we do not
  # know the actual previous status of the cluster.
- cmd = job_lib.JobLibCodeGen.update_status()
  logger.debug('Update job queue on remote cluster.')
  with rich_utils.safe_status(
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
- returncode, _, stderr = self.run_on_head(handle,
- cmd,
- require_outputs=True)
- subprocess_utils.handle_returncode(returncode, cmd,
- 'Failed to update job status.',
- stderr)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.UpdateStatusRequest()
+ backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).update_status(request))
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ cmd = job_lib.JobLibCodeGen.update_status()
+ returncode, _, stderr = self.run_on_head(
+ handle, cmd, require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, cmd, 'Failed to update job status.', stderr)
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
  # Safely set all the previous jobs to FAILED since the cluster
  # is restarted
@@ -3576,14 +3664,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # 1. A job finishes RUNNING, but right before it update itself
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
  # 2. On next `sky start`, it gets reset to FAILED.
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
- returncode, stdout, stderr = self.run_on_head(handle,
- cmd,
- require_outputs=True)
- subprocess_utils.handle_returncode(
- returncode, cmd,
- 'Failed to set previously in-progress jobs to FAILED',
- stdout + stderr)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
+ backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel(
+ )).fail_all_in_progress_jobs(fail_request))
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
+ returncode, stdout, stderr = self.run_on_head(
+ handle, cmd, require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, cmd,
+ 'Failed to set previously in-progress jobs to FAILED',
+ stdout + stderr)

  prev_ports = None
  if prev_handle is not None:
@@ -3942,109 +4041,161 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  remote_log_dir: Optional[str] = None,
  ) -> None:
  """Executes generated code on the head node."""
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
+ use_legacy = not handle.is_grpc_enabled_with_flag
+ file_name = f'sky_job_{job_id}'
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
  if remote_log_dir is None:
  remote_log_dir = self.log_dir
  remote_log_path = os.path.join(remote_log_dir, 'run.log')

- cd = f'cd {SKY_REMOTE_WORKDIR}'
+ def _dump_code_to_file(codegen: str,
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
+ runners = handle.get_command_runners()
+ head_runner = runners[0]
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
+ fp.write(codegen)
+ fp.flush()
+ script_path = os.path.join(target_dir, file_name)
+ # We choose to sync code + exec, because the alternative of
+ # 'ray submit' may not work as it may use system python
+ # (python2) to execute the script. Happens for AWS.
+ head_runner.rsync(source=fp.name,
+ target=script_path,
+ up=True,
+ stream_logs=False)

+ cd = f'cd {SKY_REMOTE_WORKDIR}'
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
  f'touch {remote_log_path}')
  encoded_script = shlex.quote(codegen)
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
  job_submit_cmd = (
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
- # with pid is the same driver process.
+ # JOB_CMD_IDENTIFIER is used for identifying the process
+ # retrieved with pid is the same driver process.
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
  # Do not use &>, which is not POSIX and may not work.
  # Note that the order of ">filename 2>&1" matters.
  f'> {remote_log_path} 2>&1')
-
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

- def _dump_code_to_file(codegen: str,
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
- runners = handle.get_command_runners()
- head_runner = runners[0]
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
- fp.write(codegen)
- fp.flush()
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
- # We choose to sync code + exec, because the alternative of 'ray
- # submit' may not work as it may use system python (python2) to
- # execute the script. Happens for AWS.
- head_runner.rsync(source=fp.name,
- target=script_path,
- up=True,
- stream_logs=False)
-
  # Should also be ealier than _is_command_length_over_limit
  # Same reason as in _setup
  if self._dump_final_script:
  _dump_code_to_file(job_submit_cmd,
  constants.PERSISTENT_RUN_SCRIPT_DIR)

- if _is_command_length_over_limit(job_submit_cmd):
- _dump_code_to_file(codegen)
- job_submit_cmd = f'{mkdir_code} && {code}'
-
- def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
- if managed_job_dag is not None:
- # Add the managed job to job queue database.
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
- managed_job_code = managed_job_codegen.set_pending(
- job_id,
- managed_job_dag,
- skypilot_config.get_active_workspace(
- force_user_workspace=True),
- entrypoint=common_utils.get_current_command())
- # Set the managed job to PENDING state to make sure that this
- # managed job appears in the `sky jobs queue`, even if it needs
- # to wait to be submitted.
- # We cannot set the managed job to PENDING state in the job
- # template (jobs-controller.yaml.j2), as it may need to wait for
- # the run commands to be scheduled on the job controller in
- # high-load cases.
- job_submit_cmd += ' && ' + managed_job_code
- return job_submit_cmd
-
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
+ if managed_job_dag is not None:
+ workspace = skypilot_config.get_active_workspace(
+ force_user_workspace=True)
+ entrypoint = common_utils.get_current_command()
+
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
+ for task_id, task in enumerate(managed_job_dag.tasks):
+ resources_str = backend_utils.get_task_resources_str(
+ task, is_managed_job=True)
+ managed_job_tasks.append(
+ jobsv1_pb2.ManagedJobTask(
+ task_id=task_id,
+ name=task.name,
+ resources_str=resources_str,
+ metadata_json=task.metadata_json))
+
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
+ name=managed_job_dag.name,
+ pool=managed_job_dag.pool,
+ workspace=workspace,
+ entrypoint=entrypoint,
+ tasks=managed_job_tasks)
+
+ if _is_command_length_over_limit(codegen):
+ _dump_code_to_file(codegen)
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
+ job_id=job_id,
+ # codegen not set - server assumes script uploaded
+ remote_log_dir=remote_log_dir,
+ managed_job=managed_job_info,
+ script_path=script_path)
+ else:
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
+ job_id=job_id,
+ codegen=codegen,
+ remote_log_dir=remote_log_dir,
+ managed_job=managed_job_info,
+ script_path=script_path)
+
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+ handle.get_grpc_channel()).queue_job(queue_job_request))
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ if _is_command_length_over_limit(job_submit_cmd):
+ _dump_code_to_file(codegen)
+ job_submit_cmd = f'{mkdir_code} && {code}'
+
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
+ if managed_job_dag is not None:
+ # Add the managed job to job queue database.
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
+ managed_job_code = managed_job_codegen.set_pending(
+ job_id,
+ managed_job_dag,
+ skypilot_config.get_active_workspace(
+ force_user_workspace=True),
+ entrypoint=common_utils.get_current_command())
+ # Set the managed job to PENDING state to make sure that
+ # this managed job appears in the `sky jobs queue`, even
+ # if it needs to wait to be submitted.
+ # We cannot set the managed job to PENDING state in the
+ # job template (jobs-controller.yaml.j2), as it may need
+ # to wait for the run commands to be scheduled on the job
+ # controller in high-load cases.
+ job_submit_cmd += ' && ' + managed_job_code
+ return job_submit_cmd

- returncode, stdout, stderr = self.run_on_head(handle,
- job_submit_cmd,
- stream_logs=False,
- require_outputs=True)
- # Happens when someone calls `sky exec` but remote is outdated for
- # running a job. Necessitating calling `sky launch`.
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
- handle.cluster_name)
- output = stdout + stderr
- if ((returncode == 255 and 'too long' in output.lower()) or
- (returncode == 1 and 'request-uri too large' in output.lower())):
- # If the generated script is too long, we retry it with dumping
- # the script to a file and running it with SSH. We use a general
- # length limit check before but it could be inaccurate on some
- # systems.
- # When there is a cloudflare proxy in front of the remote, it could
- # cause `414 Request-URI Too Large` error.
- logger.debug('Failed to submit job due to command length limit. '
- 'Dumping job to file and running it with SSH. '
- f'Output: {output}')
- _dump_code_to_file(codegen)
- job_submit_cmd = f'{mkdir_code} && {code}'
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+
  returncode, stdout, stderr = self.run_on_head(handle,
  job_submit_cmd,
  stream_logs=False,
  require_outputs=True)
+ # Happens when someone calls `sky exec` but remote is outdated for
+ # running a job. Necessitating calling `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
+ output = stdout + stderr
+ if ((returncode == 255 and 'too long' in output.lower()) or
+ (returncode == 1 and
+ 'request-uri too large' in output.lower())):
+ # If the generated script is too long, we retry it with dumping
+ # the script to a file and running it with SSH. We use a general
+ # length limit check before but it could be inaccurate on some
+ # systems.
+ # When there is a cloudflare proxy in front of the remote, it
+ # could cause `414 Request-URI Too Large` error.
+ logger.debug(
+ 'Failed to submit job due to command length limit. '
+ 'Dumping job to file and running it with SSH. '
+ f'Output: {output}')
+ _dump_code_to_file(codegen)
+ job_submit_cmd = f'{mkdir_code} && {code}'
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
+ returncode, stdout, stderr = self.run_on_head(
+ handle,
+ job_submit_cmd,
+ stream_logs=False,
+ require_outputs=True)

- subprocess_utils.handle_returncode(returncode,
- job_submit_cmd,
- f'Failed to submit job {job_id}.',
- stderr=stdout + stderr)
+ subprocess_utils.handle_returncode(
+ returncode,
+ job_submit_cmd,
+ f'Failed to submit job {job_id}.',
+ stderr=stdout + stderr)

  controller = controller_utils.Controllers.from_name(handle.cluster_name)
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -4065,42 +4216,64 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  def _add_job(self, handle: CloudVmRayResourceHandle,
  job_name: Optional[str], resources_str: str,
  metadata: str) -> Tuple[int, str]:
- code = job_lib.JobLibCodeGen.add_job(
- job_name=job_name,
- username=common_utils.get_user_hash(),
- run_timestamp=self.run_timestamp,
- resources_str=resources_str,
- metadata=metadata)
- returncode, result_str, stderr = self.run_on_head(handle,
- code,
- stream_logs=False,
- require_outputs=True,
- separate_stderr=True)
- # Happens when someone calls `sky exec` but remote is outdated for
- # adding a job. Necessitating calling `sky launch`.
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
- handle.cluster_name)
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
- # retry for this, after we figure out the reason.
- subprocess_utils.handle_returncode(returncode, code,
- 'Failed to fetch job id.', stderr)
- try:
- job_id_match = _JOB_ID_PATTERN.search(result_str)
- if job_id_match is not None:
- job_id = int(job_id_match.group(1))
- else:
- # For backward compatibility.
- job_id = int(result_str)
- log_dir_match = _LOG_DIR_PATTERN.search(result_str)
- if log_dir_match is not None:
- log_dir = log_dir_match.group(1).strip()
- else:
- # For backward compatibility, use the same log dir as local.
- log_dir = self.log_dir
- except ValueError as e:
- logger.error(stderr)
- raise ValueError(f'Failed to parse job id: {result_str}; '
- f'Returncode: {returncode}') from e
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.AddJobRequest(
+ job_name=job_name,
+ username=common_utils.get_user_hash(),
+ run_timestamp=self.run_timestamp,
+ resources_str=resources_str,
+ metadata=metadata)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
+ request))
+ job_id = response.job_id
+ log_dir = response.log_dir
+ return job_id, log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.add_job(
+ job_name=job_name,
+ username=common_utils.get_user_hash(),
+ run_timestamp=self.run_timestamp,
+ resources_str=resources_str,
+ metadata=metadata)
+ returncode, result_str, stderr = self.run_on_head(
+ handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
+ # Happens when someone calls `sky exec` but remote is outdated for
+ # adding a job. Necessitating calling `sky launch`.
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+ handle.cluster_name)
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
+ # retry for this, after we figure out the reason.
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to fetch job id.',
+ stderr)
+ try:
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
+ if job_id_match is not None:
+ job_id = int(job_id_match.group(1))
+ else:
+ # For backward compatibility.
+ job_id = int(result_str)
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
+ if log_dir_match is not None:
+ log_dir = log_dir_match.group(1).strip()
+ else:
+ # For backward compatibility, use the same log dir as local.
+ log_dir = self.log_dir
+ except ValueError as e:
+ logger.error(stderr)
+ raise ValueError(f'Failed to parse job id: {result_str}; '
+ f'Returncode: {returncode}') from e
  return job_id, log_dir

  def _execute(
@@ -4279,6 +4452,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  job_ids: Optional[List[int]] = None,
  stream_logs: bool = True
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).get_job_status(request))
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
+ for job_id, proto_status in response.job_statuses.items()
+ }
+ return statuses
+ except exceptions.SkyletMethodNotImplementedError:
+ pass
+
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
  returncode, stdout, stderr = self.run_on_head(handle,
  code,
@@ -4299,16 +4486,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
  """
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
- returncode, stdout, _ = self.run_on_head(handle,
- code,
- stream_logs=False,
- require_outputs=True)
- subprocess_utils.handle_returncode(
- returncode, code,
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
-
- cancelled_ids = message_utils.decode_payload(stdout)
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
+ cancel_all=cancel_all,
+ user_hash=user_hash)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
+ request))
+ cancelled_ids = response.cancelled_job_ids
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
+ user_hash)
+ returncode, stdout, _ = self.run_on_head(handle,
+ code,
+ stream_logs=False,
+ require_outputs=True)
+ subprocess_utils.handle_returncode(
+ returncode, code,
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
+ stdout)
+ cancelled_ids = message_utils.decode_payload(stdout)
  if cancelled_ids:
  logger.info(
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4325,20 +4528,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Returns:
  A dictionary mapping job_id to log path.
  """
- code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
- returncode, job_to_dir, stderr = self.run_on_head(handle,
+ job_to_dir: Dict[str, str] = {}
+ use_legacy = not handle.is_grpc_enabled_with_flag
+
+ if handle.is_grpc_enabled_with_flag:
+ try:
+ int_job_ids = []
+ if job_ids:
+ for str_job_id in job_ids:
+ if str_job_id.isdigit():
+ int_job_ids.append(int(str_job_id))
+ request = jobsv1_pb2.GetLogDirsForJobsRequest(
+ job_ids=int_job_ids)
+ response = backend_utils.invoke_skylet_with_retries(
+ lambda: SkyletClient(handle.get_grpc_channel()
+ ).get_log_dirs_for_jobs(request))
+ job_log_dirs = response.job_log_dirs
+ if not job_log_dirs:
+ logger.info(f'{colorama.Fore.YELLOW}'
+ 'No matching log directories found'
+ f'{colorama.Style.RESET_ALL}')
+ return {}
+ for job_id, log_dir in job_log_dirs.items():
+ # Convert to string for backwards compatibility
+ job_to_dir[str(job_id)] = log_dir
+ except exceptions.SkyletMethodNotImplementedError:
+ use_legacy = True
+
+ if use_legacy:
+ code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+ returncode, stdout, stderr = self.run_on_head(handle,
  code,
  stream_logs=False,
  require_outputs=True,
  separate_stderr=True)
- subprocess_utils.handle_returncode(returncode, code,
- 'Failed to sync logs.', stderr)
- job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
- if not job_to_dir:
- logger.info(f'{colorama.Fore.YELLOW}'
- 'No matching log directories found'
- f'{colorama.Style.RESET_ALL}')
- return {}
+ subprocess_utils.handle_returncode(returncode, code,
+ 'Failed to sync logs.', stderr)
+ job_to_dir = message_utils.decode_payload(stdout)
+ if not job_to_dir:
+ logger.info(f'{colorama.Fore.YELLOW}'
+ 'No matching log directories found'
+ f'{colorama.Style.RESET_ALL}')
+ return {}

  job_ids = list(job_to_dir.keys())
  dirs = list(job_to_dir.values())
@@ -4615,11 +4846,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  exist_ok=True)
  log_file = os.path.join(local_log_dir, 'run.log')

- code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
- job_id=job_id,
- follow=False,
- controller=False)
-
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(
+ job_name=None,
+ job_id=int(job_id),
+ follow=False,
+ controller=False)
  # With the stdin=subprocess.DEVNULL, the ctrl-c will not
  # kill the process, so we need to handle it manually here.
  if threading.current_thread() is threading.main_thread():
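
Note the structure repeated across the update_status, fail_all_in_progress_jobs, _add_job, cancel_jobs, and get_log_dirs_for_jobs hunks above: each operation first tries the new Skylet gRPC path and only falls back to the legacy codegen-over-SSH path when the remote skylet does not implement the RPC. Condensed, as a paraphrase of this diff (where _do_grpc_call and _run_legacy_codegen are placeholders for the per-operation bodies shown above, not real helpers in the codebase):

    # Hedged paraphrase of the recurring gRPC-first / legacy-fallback pattern.
    use_legacy = not handle.is_grpc_enabled_with_flag
    if handle.is_grpc_enabled_with_flag:
        try:
            # e.g. SkyletClient(handle.get_grpc_channel()).update_status(request)
            _do_grpc_call()
        except exceptions.SkyletMethodNotImplementedError:
            # Remote skylet predates this RPC; use the old code path instead.
            use_legacy = True
    if use_legacy:
        # e.g. job_lib.JobLibCodeGen.update_status() executed via run_on_head.
        _run_legacy_codegen()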