skypilot-nightly 1.0.0.dev20250926__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (57)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +25 -4
  3. sky/backends/cloud_vm_ray_backend.py +151 -36
  4. sky/client/cli/command.py +2 -1
  5. sky/client/cli/table_utils.py +34 -0
  6. sky/client/sdk.py +7 -5
  7. sky/client/sdk_async.py +5 -5
  8. sky/core.py +3 -4
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{webpack-8e64d11e58eab5cb.js → webpack-7340bc0f0dd8ae74.js} +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/config.html +1 -1
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/infra/[context].html +1 -1
  20. sky/dashboard/out/infra.html +1 -1
  21. sky/dashboard/out/jobs/[job].html +1 -1
  22. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  23. sky/dashboard/out/jobs.html +1 -1
  24. sky/dashboard/out/users.html +1 -1
  25. sky/dashboard/out/volumes.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/execution.py +0 -1
  30. sky/global_user_state.py +3 -3
  31. sky/jobs/server/core.py +96 -26
  32. sky/jobs/server/utils.py +65 -32
  33. sky/jobs/state.py +145 -3
  34. sky/jobs/utils.py +85 -7
  35. sky/schemas/api/responses.py +18 -0
  36. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  37. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  38. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  39. sky/serve/serve_utils.py +16 -0
  40. sky/serve/server/core.py +1 -1
  41. sky/serve/server/impl.py +6 -6
  42. sky/server/requests/serializers/decoders.py +2 -2
  43. sky/server/requests/serializers/encoders.py +7 -3
  44. sky/skylet/constants.py +1 -1
  45. sky/skylet/job_lib.py +2 -32
  46. sky/skylet/log_lib.py +211 -0
  47. sky/skylet/log_lib.pyi +30 -1
  48. sky/skylet/services.py +208 -2
  49. sky/skylet/skylet.py +3 -0
  50. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +32 -32
  51. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +56 -52
  52. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  53. /sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
  54. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
  55. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
  56. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
  57. {skypilot_nightly-1.0.0.dev20250926.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '827d534c8bbfa61b895467b9431283e923dd9841'
+_SKYPILOT_COMMIT_SHA = 'e42224b6d29bd960c0e0daa69add0fe2ad695142'
 
 
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250926'
+__version__ = '1.0.0.dev20250927'
 __root_dir__ = directory_utils.get_sky_dir()
 
 
sky/backends/backend_utils.py CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
-                    TypeVar, Union)
+from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+                    Set, Tuple, TypeVar, Union)
 import uuid
 
 import aiohttp
@@ -1226,7 +1226,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     Rather than constructing the whole byte sequence, which may be quite large,
     we construct it incrementally by using hash.update() to add new bytes.
     """
-
     # Load the yaml contents so that we can directly remove keys.
     yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -3859,13 +3858,35 @@ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     ) from last_exception
 
 
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
 def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
     if e.code() == grpc.StatusCode.INTERNAL:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.SkyletInternalError(e.details())
     elif e.code() == grpc.StatusCode.UNAVAILABLE:
         time.sleep(current_backoff)
-    elif e.code() == grpc.StatusCode.UNIMPLEMENTED:
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
         # Handle backwards compatibility: old server doesn't implement this RPC.
         # Let the caller fall back to legacy execution.
         raise exceptions.SkyletMethodNotImplementedError(
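Note: `invoke_skylet_streaming_with_retries` takes a zero-argument callable so a fresh gRPC stream can be re-created on each retry attempt. A minimal usage sketch, assuming a `client` (a SkyletClient) and a `request` (a TailLogsRequest) built by the caller, mirroring the tail-logs call later in this diff:

    def _open_stream():
        # Re-created on every retry so each attempt gets a new stream.
        return client.tail_logs(request, timeout=None)

    for response in backend_utils.invoke_skylet_streaming_with_retries(_open_stream):
        print(response.log_line, end='', flush=True)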
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -19,8 +19,8 @@ import textwrap
 import threading
 import time
 import typing
-from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
-                    Union)
+from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                    Set, Tuple, Union)
 
 import colorama
 import psutil
@@ -91,6 +91,8 @@ if typing.TYPE_CHECKING:
     from sky.schemas.generated import autostopv1_pb2_grpc
     from sky.schemas.generated import jobsv1_pb2
     from sky.schemas.generated import jobsv1_pb2_grpc
+    from sky.schemas.generated import managed_jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2_grpc
     from sky.schemas.generated import servev1_pb2
     from sky.schemas.generated import servev1_pb2_grpc
 else:
@@ -111,6 +113,10 @@ else:
         'sky.schemas.generated.servev1_pb2')
     servev1_pb2_grpc = adaptors_common.LazyImport(
        'sky.schemas.generated.servev1_pb2_grpc')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
+    managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2_grpc')
 
 Path = str
 
@@ -2737,6 +2743,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             (tunnel.port, tunnel.pid) if tunnel is not None else None)
 
     def get_grpc_channel(self) -> 'grpc.Channel':
+        grpc_options = [
+            # The task YAMLs can be large, so the default
+            # max_receive_message_length of 4MB might not be enough.
+            ('grpc.max_receive_message_length', -1),
+        ]
         # It's fine to not grab the lock here, as we're only reading,
         # and writes are very rare.
         # It's acceptable to read while another process is opening a tunnel,
@@ -2753,7 +2764,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                 s.settimeout(0.5)
                 s.connect(('localhost', tunnel.port))
-                return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                             options=grpc_options)
         except socket.error as e:
             logger.warning(
                 'Failed to connect to SSH tunnel for cluster '
@@ -2772,19 +2784,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                         f'{self.cluster_name!r}, '
                         'opening the tunnel')
                     tunnel = self._open_and_update_skylet_tunnel()
-                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
                 try:
                     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                         s.settimeout(0.5)
                         s.connect(('localhost', tunnel.port))
-                        return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                        return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                     options=grpc_options)
                 except socket.error as e:
                     logger.warning(
                         'Failed to connect to SSH tunnel for cluster '
                         f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                         'opening new tunnel')
                     tunnel = self._open_and_update_skylet_tunnel()
-                    return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                    return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                                 options=grpc_options)
         except locks.LockTimeout as e:
             raise RuntimeError(
                 'Failed to get gRPC channel for cluster '
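For reference, `('grpc.max_receive_message_length', -1)` lifts the default 4 MB cap on inbound message size on this channel, which is why it is threaded through every `grpc.insecure_channel` call above. A standalone sketch of the same channel setup (the target address is illustrative, not a SkyPilot default):

    import grpc

    # -1 means 'no limit' for inbound message size on this channel.
    options = [('grpc.max_receive_message_length', -1)]
    channel = grpc.insecure_channel('localhost:46581', options=options)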
@@ -3060,6 +3075,8 @@ class SkyletClient:
         self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
         self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
         self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+        self._managed_jobs_stub = (
+            managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
 
     def set_autostop(
         self,
@@ -3146,6 +3163,13 @@ class SkyletClient:
     ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
         return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
 
+    def tail_logs(
+        self,
+        request: 'jobsv1_pb2.TailLogsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+        return self._jobs_stub.TailLogs(request, timeout=timeout)
+
     def get_service_status(
         self,
         request: 'servev1_pb2.GetServiceStatusRequest',
@@ -3194,6 +3218,35 @@ class SkyletClient:
     ) -> 'servev1_pb2.UpdateServiceResponse':
         return self._serve_stub.UpdateService(request, timeout=timeout)
 
+    def get_managed_job_controller_version(
+        self,
+        request: 'managed_jobsv1_pb2.GetVersionRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+        return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+    def get_managed_job_table(
+        self,
+        request: 'managed_jobsv1_pb2.GetJobTableRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+        return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+    def get_all_managed_job_ids_by_name(
+        self,
+        request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+        return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                          timeout=timeout)
+
+    def cancel_managed_jobs(
+        self,
+        request: 'managed_jobsv1_pb2.CancelJobsRequest',
+        timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+    ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+        return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+
 
 @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
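The new wrappers are thin pass-throughs to the generated ManagedJobsService stub; callers are expected to go through the retry helper, as the later hunks in this diff do. An illustrative sketch only (the empty CancelJobsRequest is an assumption; the real fields live in managed_jobsv1_pb2):

    # Assumed request construction; see managed_jobsv1_pb2 for actual fields.
    request = managed_jobsv1_pb2.CancelJobsRequest()
    response = backend_utils.invoke_skylet_with_retries(
        lambda: SkyletClient(handle.get_grpc_channel()).cancel_managed_jobs(
            request))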
@@ -3706,7 +3759,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 ux_utils.spinner_message('Preparing SkyPilot runtime')):
             use_legacy = not handle.is_grpc_enabled_with_flag
 
-            if handle.is_grpc_enabled_with_flag:
+            if not use_legacy:
                 try:
                     request = jobsv1_pb2.UpdateStatusRequest()
                     backend_utils.invoke_skylet_with_retries(
@@ -3730,7 +3783,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # 2. On next `sky start`, it gets reset to FAILED.
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if handle.is_grpc_enabled_with_flag:
+        if not use_legacy:
             try:
                 fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
                 backend_utils.invoke_skylet_with_retries(
@@ -4165,7 +4218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         _dump_code_to_file(job_submit_cmd,
                            constants.PERSISTENT_RUN_SCRIPT_DIR)
 
-        if handle.is_grpc_enabled_with_flag:
+        if not use_legacy:
             try:
                 managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
                 if managed_job_dag is not None:
@@ -4297,7 +4350,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 metadata: str) -> Tuple[int, str]:
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if handle.is_grpc_enabled_with_flag:
+        if not use_legacy:
             try:
                 request = jobsv1_pb2.AddJobRequest(
                     job_name=job_name,
@@ -4567,7 +4620,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         """
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if handle.is_grpc_enabled_with_flag:
+        if not use_legacy:
             try:
                 request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
                                                        cancel_all=cancel_all,
@@ -4610,7 +4663,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_to_dir: Dict[str, str] = {}
         use_legacy = not handle.is_grpc_enabled_with_flag
 
-        if handle.is_grpc_enabled_with_flag:
+        if not use_legacy:
             try:
                 int_job_ids = []
                 if job_ids:
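All of the hunks above follow the same gRPC-first pattern: attempt the Skylet RPC, and fall back to the legacy code-generation path if the remote Skylet predates the RPC. A condensed sketch of that control flow (`some_rpc`, `some_legacy_codegen`, and `request` are placeholders, not real names):

    use_legacy = not handle.is_grpc_enabled_with_flag
    if not use_legacy:
        try:
            backend_utils.invoke_skylet_with_retries(
                lambda: SkyletClient(handle.get_grpc_channel()).some_rpc(request))
        except exceptions.SkyletMethodNotImplementedError:
            # Old Skylet without this RPC: use the legacy path below.
            use_legacy = True
    if use_legacy:
        code = job_lib.JobLibCodeGen.some_legacy_codegen(job_ids)
        self.run_on_head(handle, code, stream_logs=False)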
@@ -4724,6 +4777,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             The exit code of the tail command. Returns code 100 if the job has
             failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
@@ -4761,6 +4836,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                   tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4806,20 +4882,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            # generate code to get the job_id
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
-                job_name=job_name)
-            returncode, job_ids, stderr = self.run_on_head(handle,
-                                                           code,
-                                                           stream_logs=False,
-                                                           require_outputs=True,
-                                                           separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync down logs.',
-                                               stderr)
-            job_ids = message_utils.decode_payload(job_ids)
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
         if not job_ids:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching job found'
@@ -4847,18 +4940,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
-            returncode, run_timestamps_payload, stderr = self.run_on_head(
-                handle,
-                code,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync logs.', stderr)
-            # returns with a dict of {job_id: run_timestamp}
-            run_timestamps = message_utils.decode_payload(
-                run_timestamps_payload)
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -4925,6 +5039,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                     exist_ok=True)
         log_file = os.path.join(local_log_dir, 'run.log')
 
+        # TODO(kevin): Migrate stream_logs to gRPC
        code = managed_jobs.ManagedJobCodeGen.stream_logs(
            job_name=None,
            job_id=int(job_id),
sky/client/cli/command.py CHANGED
@@ -59,6 +59,7 @@ from sky import task as task_lib
 from sky.adaptors import common as adaptors_common
 from sky.client import sdk
 from sky.client.cli import flags
+from sky.client.cli import table_utils
 from sky.data import storage_utils
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -2125,7 +2126,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
                 f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
                 f' {common_utils.format_exception(e)}')
             return
-        job_tables[cluster] = job_lib.format_job_queue(job_table)
+        job_tables[cluster] = table_utils.format_job_queue(job_table)
 
     subprocess_utils.run_in_parallel(_get_job_queue, clusters)
     user_str = 'all users' if all_users else 'current user'
sky/client/cli/table_utils.py ADDED
@@ -0,0 +1,34 @@
+"""Utilities for formatting tables for CLI output."""
+from typing import List
+
+from sky.schemas.api import responses
+from sky.utils import log_utils
+
+
+def format_job_queue(jobs: List[responses.ClusterJobRecord]):
+    """Format the job queue for display.
+
+    Usage:
+        jobs = get_job_queue()
+        print(format_job_queue(jobs))
+    """
+    job_table = log_utils.create_table([
+        'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
+        'STATUS', 'LOG', 'GIT COMMIT'
+    ])
+    for job in jobs:
+        job_table.add_row([
+            job.job_id,
+            job.job_name,
+            job.username,
+            log_utils.readable_time_duration(job.submitted_at),
+            log_utils.readable_time_duration(job.start_at),
+            log_utils.readable_time_duration(job.start_at,
+                                             job.end_at,
+                                             absolute=True),
+            job.resources,
+            job.status.colored_str(),
+            job.log_path,
+            job.metadata.get('git_commit', '-'),
+        ])
+    return job_table
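This helper replaces the `job_lib.format_job_queue` call previously used by the CLI (see the command.py hunk above). A hedged usage sketch, assuming a reachable API server and an existing cluster whose name ('my-cluster') is a placeholder:

    from sky.client import sdk
    from sky.client.cli import table_utils

    # sdk.queue() returns a request ID; sdk.get() waits for the typed result.
    job_records = sdk.get(sdk.queue('my-cluster', all_users=True))
    print(table_utils.format_job_queue(job_records))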
sky/client/sdk.py CHANGED
@@ -1267,9 +1267,11 @@ def autostop(
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def queue(cluster_name: str,
-          skip_finished: bool = False,
-          all_users: bool = False) -> server_common.RequestId[List[dict]]:
+def queue(
+    cluster_name: str,
+    skip_finished: bool = False,
+    all_users: bool = False
+) -> server_common.RequestId[List[responses.ClusterJobRecord]]:
     """Gets the job queue of a cluster.
 
     Args:
@@ -1282,8 +1284,8 @@ def queue(cluster_name: str,
         The request ID of the queue request.
 
     Request Returns:
-        job_records (List[Dict[str, Any]]): A list of dicts for each job in the
-            queue.
+        job_records (List[responses.ClusterJobRecord]): A list of job records
+            for each job in the queue.
 
     .. code-block:: python
 
sky/client/sdk_async.py CHANGED
@@ -523,11 +523,11 @@ async def autostop(
 @usage_lib.entrypoint
 @annotations.client_api
 async def queue(
-        cluster_name: str,
-        skip_finished: bool = False,
-        all_users: bool = False,
-        stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
-) -> List[dict]:
+    cluster_name: str,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
+) -> List[responses.ClusterJobRecord]:
     """Async version of queue() that gets the job queue of a cluster."""
     request_id = await context_utils.to_thread(sdk.queue, cluster_name,
                                                skip_finished, all_users)
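With the typed return value, async callers receive `responses.ClusterJobRecord` objects instead of raw dicts. A small sketch, with 'my-cluster' again a placeholder cluster name:

    import asyncio

    from sky.client import sdk_async

    async def main():
        jobs = await sdk_async.queue('my-cluster', skip_finished=True)
        for job in jobs:
            # Attribute access replaces the old dict lookups.
            print(job.job_id, job.job_name, job.status)

    asyncio.run(main())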
sky/core.py CHANGED
@@ -803,7 +803,7 @@ def autostop(
 @usage_lib.entrypoint
 def queue(cluster_name: str,
           skip_finished: bool = False,
-          all_users: bool = False) -> List[dict]:
+          all_users: bool = False) -> List[responses.ClusterJobRecord]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets the job queue of a cluster.
 
@@ -850,7 +850,7 @@
 
     use_legacy = not handle.is_grpc_enabled_with_flag
 
-    if handle.is_grpc_enabled_with_flag:
+    if not use_legacy:
         try:
             request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
                                                     all_jobs=all_jobs)
@@ -879,7 +879,6 @@
                 jobs.append(job_dict)
         except exceptions.SkyletMethodNotImplementedError:
             use_legacy = True
-
     if use_legacy:
         code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
         returncode, jobs_payload, stderr = backend.run_on_head(
@@ -891,7 +890,7 @@
             stderr=f'{jobs_payload + stderr}',
             stream_logs=True)
         jobs = job_lib.load_job_queue(jobs_payload)
-    return jobs
+    return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
 
 
 @usage_lib.entrypoint
sky/dashboard/out/404.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-8e64d11e58eab5cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/VXU6_xE28M55BOdwmUUJS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"VXU6_xE28M55BOdwmUUJS","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-7340bc0f0dd8ae74.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/UDSEoDB67vwFMZyCJ4HWU/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"UDSEoDB67vwFMZyCJ4HWU","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/_next/static/{VXU6_xE28M55BOdwmUUJS → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js RENAMED
@@ -1 +1 @@
- self.__BUILD_MANIFEST=function(s,c,a,t,e,f,u,n,b,o,j,i,r,k){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-444f1804401f04ea.js"],"/_error":["static/chunks/pages/_error-c66a4e8afc46f17b.js"],"/clusters":["static/chunks/pages/clusters-469814d711d63b1b.js"],"/clusters/[cluster]":[s,c,a,f,u,"static/chunks/4676-9da7fdbde90b5549.js",o,t,e,n,j,b,i,"static/chunks/6856-2b3600ff2854d066.js",r,k,"static/chunks/9037-d0c00018a5ba198c.js","static/chunks/pages/clusters/[cluster]-e052384df65ef200.js"],"/clusters/[cluster]/[job]":[s,c,a,f,t,e,b,"static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js"],"/config":["static/chunks/pages/config-dfb9bf07b13045f4.js"],"/infra":["static/chunks/pages/infra-aabba60d57826e0f.js"],"/infra/[context]":["static/chunks/pages/infra/[context]-6563820e094f68ca.js"],"/jobs":["static/chunks/pages/jobs-1f70d9faa564804f.js"],"/jobs/pools/[pool]":[s,c,a,u,o,t,e,n,"static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js"],"/jobs/[job]":[s,c,a,f,u,o,t,e,n,b,"static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js"],"/users":["static/chunks/pages/users-018bf31cda52e11b.js"],"/volumes":["static/chunks/pages/volumes-739726d6b823f532.js"],"/workspace/new":["static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js"],"/workspaces":["static/chunks/pages/workspaces-7528cc0ef8c522c5.js"],"/workspaces/[name]":[s,c,a,f,u,"static/chunks/1836-37fede578e2da5f8.js",t,e,n,j,b,i,r,k,"static/chunks/1141-159df2d4c441a9d1.js","static/chunks/pages/workspaces/[name]-af76bb06dbb3954f.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/config","/infra","/infra/[context]","/jobs","/jobs/pools/[pool]","/jobs/[job]","/users","/volumes","/workspace/new","/workspaces","/workspaces/[name]"]}}("static/chunks/616-3d59f75e2ccf9321.js","static/chunks/6130-2be46d70a38f1e82.js","static/chunks/5739-d67458fcb1386c92.js","static/chunks/6989-01359c57e018caa4.js","static/chunks/3850-ff4a9a69d978632b.js","static/chunks/7411-b15471acd2cba716.js","static/chunks/1272-1ef0bf0237faccdb.js","static/chunks/8969-d8bc3a2b9cf839a9.js","static/chunks/6135-4b4d5e824b7f9d3c.js","static/chunks/754-d0da8ab45f9509e9.js","static/chunks/6990-f6818c84ed8f1c86.js","static/chunks/1121-d0782b9251f0fcd3.js","static/chunks/6601-06114c982db410b6.js","static/chunks/3015-88c7c8d69b0b6dba.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
+ self.__BUILD_MANIFEST=function(s,c,a,t,e,f,u,n,b,o,j,i,r,d){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-444f1804401f04ea.js"],"/_error":["static/chunks/pages/_error-c66a4e8afc46f17b.js"],"/clusters":["static/chunks/pages/clusters-469814d711d63b1b.js"],"/clusters/[cluster]":[s,c,a,f,u,"static/chunks/4676-9da7fdbde90b5549.js",o,t,e,n,j,b,i,"static/chunks/6856-5fdc9b851a18acdb.js",r,d,"static/chunks/9037-d0c00018a5ba198c.js","static/chunks/pages/clusters/[cluster]-e052384df65ef200.js"],"/clusters/[cluster]/[job]":[s,c,a,f,t,e,b,"static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js"],"/config":["static/chunks/pages/config-dfb9bf07b13045f4.js"],"/infra":["static/chunks/pages/infra-aabba60d57826e0f.js"],"/infra/[context]":["static/chunks/pages/infra/[context]-6563820e094f68ca.js"],"/jobs":["static/chunks/pages/jobs-1f70d9faa564804f.js"],"/jobs/pools/[pool]":[s,c,a,u,o,t,e,n,"static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js"],"/jobs/[job]":[s,c,a,f,u,o,t,e,n,b,"static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js"],"/users":["static/chunks/pages/users-018bf31cda52e11b.js"],"/volumes":["static/chunks/pages/volumes-739726d6b823f532.js"],"/workspace/new":["static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js"],"/workspaces":["static/chunks/pages/workspaces-7528cc0ef8c522c5.js"],"/workspaces/[name]":[s,c,a,f,u,"static/chunks/1836-37fede578e2da5f8.js",t,e,n,j,b,i,r,d,"static/chunks/1141-159df2d4c441a9d1.js","static/chunks/pages/workspaces/[name]-af76bb06dbb3954f.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/config","/infra","/infra/[context]","/jobs","/jobs/pools/[pool]","/jobs/[job]","/users","/volumes","/workspace/new","/workspaces","/workspaces/[name]"]}}("static/chunks/616-3d59f75e2ccf9321.js","static/chunks/6130-2be46d70a38f1e82.js","static/chunks/5739-d67458fcb1386c92.js","static/chunks/6989-01359c57e018caa4.js","static/chunks/3850-ff4a9a69d978632b.js","static/chunks/7411-b15471acd2cba716.js","static/chunks/1272-1ef0bf0237faccdb.js","static/chunks/8969-d8bc3a2b9cf839a9.js","static/chunks/6135-4b4d5e824b7f9d3c.js","static/chunks/754-d0da8ab45f9509e9.js","static/chunks/6990-f6818c84ed8f1c86.js","static/chunks/1121-d0782b9251f0fcd3.js","static/chunks/6601-06114c982db410b6.js","static/chunks/3015-88c7c8d69b0b6dba.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();