skypilot-nightly 1.0.0.dev20250925__py3-none-any.whl → 1.0.0.dev20250927__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +38 -14
  3. sky/backends/cloud_vm_ray_backend.py +151 -36
  4. sky/client/cli/command.py +18 -9
  5. sky/client/cli/table_utils.py +34 -0
  6. sky/client/common.py +4 -2
  7. sky/client/sdk.py +11 -7
  8. sky/client/sdk_async.py +5 -5
  9. sky/core.py +6 -6
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.03e02ae73455f48e.js → 3294.93d9336bdc032b3a.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
  17. sky/dashboard/out/_next/static/chunks/{webpack-16ba1d7187d2e3b1.js → webpack-7340bc0f0dd8ae74.js} +1 -1
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/volumes.html +1 -1
  30. sky/dashboard/out/workspace/new.html +1 -1
  31. sky/dashboard/out/workspaces/[name].html +1 -1
  32. sky/dashboard/out/workspaces.html +1 -1
  33. sky/execution.py +0 -1
  34. sky/global_user_state.py +57 -34
  35. sky/jobs/constants.py +2 -0
  36. sky/jobs/controller.py +4 -0
  37. sky/jobs/server/core.py +98 -26
  38. sky/jobs/server/utils.py +65 -32
  39. sky/jobs/state.py +145 -3
  40. sky/jobs/utils.py +85 -7
  41. sky/provision/runpod/__init__.py +2 -0
  42. sky/schemas/api/responses.py +18 -0
  43. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  44. sky/schemas/generated/managed_jobsv1_pb2.py +70 -0
  45. sky/schemas/generated/managed_jobsv1_pb2.pyi +262 -0
  46. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  47. sky/serve/serve_utils.py +16 -0
  48. sky/serve/server/core.py +1 -1
  49. sky/serve/server/impl.py +6 -6
  50. sky/server/requests/payloads.py +2 -1
  51. sky/server/requests/serializers/decoders.py +2 -2
  52. sky/server/requests/serializers/encoders.py +7 -3
  53. sky/setup_files/dependencies.py +1 -1
  54. sky/skylet/constants.py +4 -1
  55. sky/skylet/events.py +42 -0
  56. sky/skylet/job_lib.py +2 -32
  57. sky/skylet/log_lib.py +211 -0
  58. sky/skylet/log_lib.pyi +30 -1
  59. sky/skylet/services.py +208 -2
  60. sky/skylet/skylet.py +3 -0
  61. sky/templates/jobs-controller.yaml.j2 +3 -0
  62. sky/templates/kubernetes-ray.yml.j2 +8 -3
  63. sky/utils/db/db_utils.py +5 -1
  64. sky/utils/db/migration_utils.py +1 -1
  65. sky/utils/kubernetes/kubernetes_deploy_utils.py +35 -12
  66. sky/volumes/server/core.py +1 -0
  67. sky/volumes/volume.py +16 -17
  68. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/METADATA +36 -36
  69. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/RECORD +74 -69
  70. sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
  73. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +0 -16
  74. /sky/dashboard/out/_next/static/{bn-NHt5qTzeTN2PefXuDA → UDSEoDB67vwFMZyCJ4HWU}/_ssgManifest.js +0 -0
  75. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/WHEEL +0 -0
  76. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/entry_points.txt +0 -0
  77. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/licenses/LICENSE +0 -0
  78. {skypilot_nightly-1.0.0.dev20250925.dist-info → skypilot_nightly-1.0.0.dev20250927.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -7,7 +7,7 @@ import urllib.request
  from sky.utils import directory_utils

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'c5a7c4995b9a92ce1c005ad783d2725c7f7f9af2'
+ _SKYPILOT_COMMIT_SHA = 'e42224b6d29bd960c0e0daa69add0fe2ad695142'


  def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250925'
+ __version__ = '1.0.0.dev20250927'
  __root_dir__ = directory_utils.get_sky_dir()


sky/backends/backend_utils.py CHANGED
@@ -16,8 +16,8 @@ import tempfile
  import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
-                     TypeVar, Union)
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+                     Set, Tuple, TypeVar, Union)
  import uuid

  import aiohttp
@@ -797,7 +797,7 @@ def write_cluster_config(
          cloud=str(cloud).lower(),
          region=region.name,
          keys=('use_ssm',),
-         default_value=False)
+         default_value=None)

      if use_ssm and ssh_proxy_command is not None:
          raise exceptions.InvalidCloudConfigs(
@@ -805,15 +805,18 @@ def write_cluster_config(
              f'is already set to {ssh_proxy_command!r}. Please remove '
              'ssh_proxy_command or set use_ssm to false.')

-     if not use_ssm and use_internal_ips and ssh_proxy_command is None:
-         logger.warning(
-             f'{colorama.Fore.YELLOW}'
-             'use_internal_ips is set to true, '
-             'but ssh_proxy_command is not set. Defaulting to '
-             'using SSM. Specify ssh_proxy_command to use a different '
-             'https://docs.skypilot.co/en/latest/reference/config.html#'
-             f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
-         use_ssm = True
+     if use_internal_ips and ssh_proxy_command is None:
+         # Only if use_ssm is explicitly not set, we default to using SSM.
+         if use_ssm is None:
+             logger.warning(
+                 f'{colorama.Fore.YELLOW}'
+                 'use_internal_ips is set to true, '
+                 'but ssh_proxy_command is not set. Defaulting to '
+                 'using SSM. Specify ssh_proxy_command to use a different '
+                 'https://docs.skypilot.co/en/latest/reference/config.html#'
+                 f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+             use_ssm = True
+
      if use_ssm:
          aws_profile = os.environ.get('AWS_PROFILE', None)
          profile_str = f'--profile {aws_profile}' if aws_profile else ''
@@ -1223,7 +1226,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
      Rather than constructing the whole byte sequence, which may be quite large,
      we construct it incrementally by using hash.update() to add new bytes.
      """
-
      # Load the yaml contents so that we can directly remove keys.
      yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
      for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -3856,13 +3858,35 @@ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
      ) from last_exception


+ def invoke_skylet_streaming_with_retries(
+         stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+     """Generic helper for making Skylet streaming gRPC requests."""
+     max_attempts = 3
+     backoff = common_utils.Backoff(initial_backoff=0.5)
+     last_exception: Optional[Exception] = None
+
+     for _ in range(max_attempts):
+         try:
+             for response in stream_func():
+                 yield response
+             return
+         except grpc.RpcError as e:
+             last_exception = e
+             _handle_grpc_error(e, backoff.current_backoff())
+
+     raise RuntimeError(
+         f'Failed to stream Skylet response after {max_attempts} attempts'
+     ) from last_exception
+
+
  def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
      if e.code() == grpc.StatusCode.INTERNAL:
          with ux_utils.print_exception_no_traceback():
              raise exceptions.SkyletInternalError(e.details())
      elif e.code() == grpc.StatusCode.UNAVAILABLE:
          time.sleep(current_backoff)
-     elif e.code() == grpc.StatusCode.UNIMPLEMENTED:
+     elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+     ) == grpc.StatusCode.UNKNOWN:
          # Handle backwards compatibility: old server doesn't implement this RPC.
          # Let the caller fall back to legacy execution.
          raise exceptions.SkyletMethodNotImplementedError(
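
Note: the new invoke_skylet_streaming_with_retries mirrors invoke_skylet_with_retries but re-yields items from a server-streaming RPC, retrying on grpc.RpcError via _handle_grpc_error. A minimal consumption sketch under assumptions (the stub wiring and function name below are illustrative; the real call site in cloud_vm_ray_backend.py wraps a SkyletClient lambda, as shown in the next file):

# Sketch only: consuming a streaming Skylet RPC with retries.
# `stub` is assumed to be a jobsv1 JobsServiceStub; the callable is
# zero-argument so each retry attempt re-opens a fresh stream.
from sky.backends import backend_utils
from sky.schemas.generated import jobsv1_pb2

def stream_job_logs(stub, job_id: int) -> int:
    request = jobsv1_pb2.TailLogsRequest(job_id=job_id, follow=True)
    exit_code = 0
    for resp in backend_utils.invoke_skylet_streaming_with_retries(
            lambda: stub.TailLogs(request, timeout=None)):
        print(resp.log_line, end='', flush=True)
        exit_code = resp.exit_code
    return exit_code
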
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -19,8 +19,8 @@ import textwrap
  import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
-                     Union)
+ from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
+                     Set, Tuple, Union)

  import colorama
  import psutil
@@ -91,6 +91,8 @@ if typing.TYPE_CHECKING:
      from sky.schemas.generated import autostopv1_pb2_grpc
      from sky.schemas.generated import jobsv1_pb2
      from sky.schemas.generated import jobsv1_pb2_grpc
+     from sky.schemas.generated import managed_jobsv1_pb2
+     from sky.schemas.generated import managed_jobsv1_pb2_grpc
      from sky.schemas.generated import servev1_pb2
      from sky.schemas.generated import servev1_pb2_grpc
  else:
@@ -111,6 +113,10 @@ else:
          'sky.schemas.generated.servev1_pb2')
      servev1_pb2_grpc = adaptors_common.LazyImport(
          'sky.schemas.generated.servev1_pb2_grpc')
+     managed_jobsv1_pb2 = adaptors_common.LazyImport(
+         'sky.schemas.generated.managed_jobsv1_pb2')
+     managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
+         'sky.schemas.generated.managed_jobsv1_pb2_grpc')

  Path = str

@@ -2737,6 +2743,11 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
              (tunnel.port, tunnel.pid) if tunnel is not None else None)

      def get_grpc_channel(self) -> 'grpc.Channel':
+         grpc_options = [
+             # The task YAMLs can be large, so the default
+             # max_receive_message_length of 4MB might not be enough.
+             ('grpc.max_receive_message_length', -1),
+         ]
          # It's fine to not grab the lock here, as we're only reading,
          # and writes are very rare.
          # It's acceptable to read while another process is opening a tunnel,
@@ -2753,7 +2764,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                      s.settimeout(0.5)
                      s.connect(('localhost', tunnel.port))
-                 return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                 return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                              options=grpc_options)
              except socket.error as e:
                  logger.warning(
                      'Failed to connect to SSH tunnel for cluster '
@@ -2772,19 +2784,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                      f'{self.cluster_name!r}, '
                      'opening the tunnel')
                  tunnel = self._open_and_update_skylet_tunnel()
-                 return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                 return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                              options=grpc_options)
              try:
                  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                      s.settimeout(0.5)
                      s.connect(('localhost', tunnel.port))
-                 return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                 return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                              options=grpc_options)
              except socket.error as e:
                  logger.warning(
                      'Failed to connect to SSH tunnel for cluster '
                      f'{self.cluster_name!r} on port {tunnel.port} ({e}), '
                      'opening new tunnel')
                  tunnel = self._open_and_update_skylet_tunnel()
-                 return grpc.insecure_channel(f'localhost:{tunnel.port}')
+                 return grpc.insecure_channel(f'localhost:{tunnel.port}',
+                                              options=grpc_options)
          except locks.LockTimeout as e:
              raise RuntimeError(
                  'Failed to get gRPC channel for cluster '
@@ -3060,6 +3075,8 @@ class SkyletClient:
          self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
          self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
          self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
+         self._managed_jobs_stub = (
+             managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))

      def set_autostop(
          self,
@@ -3146,6 +3163,13 @@ class SkyletClient:
      ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
          return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)

+     def tail_logs(
+         self,
+         request: 'jobsv1_pb2.TailLogsRequest',
+         timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
+         return self._jobs_stub.TailLogs(request, timeout=timeout)
+
      def get_service_status(
          self,
          request: 'servev1_pb2.GetServiceStatusRequest',
@@ -3194,6 +3218,35 @@ class SkyletClient:
      ) -> 'servev1_pb2.UpdateServiceResponse':
          return self._serve_stub.UpdateService(request, timeout=timeout)

+     def get_managed_job_controller_version(
+         self,
+         request: 'managed_jobsv1_pb2.GetVersionRequest',
+         timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> 'managed_jobsv1_pb2.GetVersionResponse':
+         return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
+
+     def get_managed_job_table(
+         self,
+         request: 'managed_jobsv1_pb2.GetJobTableRequest',
+         timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
+         return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
+
+     def get_all_managed_job_ids_by_name(
+         self,
+         request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
+         timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
+         return self._managed_jobs_stub.GetAllJobIdsByName(request,
+                                                            timeout=timeout)
+
+     def cancel_managed_jobs(
+         self,
+         request: 'managed_jobsv1_pb2.CancelJobsRequest',
+         timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
+     ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
+         return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
+

  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
@@ -3706,7 +3759,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                  ux_utils.spinner_message('Preparing SkyPilot runtime')):
              use_legacy = not handle.is_grpc_enabled_with_flag

-             if handle.is_grpc_enabled_with_flag:
+             if not use_legacy:
                  try:
                      request = jobsv1_pb2.UpdateStatusRequest()
                      backend_utils.invoke_skylet_with_retries(
@@ -3730,7 +3783,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              # 2. On next `sky start`, it gets reset to FAILED.
              use_legacy = not handle.is_grpc_enabled_with_flag

-             if handle.is_grpc_enabled_with_flag:
+             if not use_legacy:
                  try:
                      fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
                      backend_utils.invoke_skylet_with_retries(
@@ -4165,7 +4218,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              _dump_code_to_file(job_submit_cmd,
                                 constants.PERSISTENT_RUN_SCRIPT_DIR)

-         if handle.is_grpc_enabled_with_flag:
+         if not use_legacy:
              try:
                  managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
                  if managed_job_dag is not None:
@@ -4297,7 +4350,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                  metadata: str) -> Tuple[int, str]:
          use_legacy = not handle.is_grpc_enabled_with_flag

-         if handle.is_grpc_enabled_with_flag:
+         if not use_legacy:
              try:
                  request = jobsv1_pb2.AddJobRequest(
                      job_name=job_name,
@@ -4567,7 +4620,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          """
          use_legacy = not handle.is_grpc_enabled_with_flag

-         if handle.is_grpc_enabled_with_flag:
+         if not use_legacy:
              try:
                  request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
                                                         cancel_all=cancel_all,
@@ -4610,7 +4663,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          job_to_dir: Dict[str, str] = {}
          use_legacy = not handle.is_grpc_enabled_with_flag

-         if handle.is_grpc_enabled_with_flag:
+         if not use_legacy:
              try:
                  int_job_ids = []
                  if job_ids:
@@ -4724,6 +4777,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              The exit code of the tail command. Returns code 100 if the job has
              failed. See exceptions.JobExitCode for possible return codes.
          """
+         if handle.is_grpc_enabled_with_flag:
+             last_exit_code = 0
+             try:
+                 request = jobsv1_pb2.TailLogsRequest(
+                     job_id=job_id,
+                     managed_job_id=managed_job_id,
+                     follow=follow,
+                     tail=tail)
+                 for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                         lambda: SkyletClient(handle.get_grpc_channel()
+                                             ).tail_logs(request, timeout=None)):
+                     if resp.log_line:
+                         print(resp.log_line, end='', flush=True)
+                     last_exit_code = resp.exit_code
+                 return last_exit_code
+             except exceptions.SkyletMethodNotImplementedError:
+                 pass
+             except grpc.RpcError as e:
+                 if e.code() == grpc.StatusCode.CANCELLED:
+                     return last_exit_code
+                 raise e
+
          code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                 managed_job_id=managed_job_id,
                                                 follow=follow,
@@ -4761,6 +4836,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                           tail: Optional[int] = None) -> int:
          # if job_name is not None, job_id should be None
          assert job_name is None or job_id is None, (job_name, job_id)
+         # TODO(kevin): Migrate stream_logs to gRPC
          code = managed_jobs.ManagedJobCodeGen.stream_logs(
              job_name, job_id, follow, controller, tail)

@@ -4806,20 +4882,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          assert job_name is None or job_id is None, (job_name, job_id)

          if job_id is None:
-             # generate code to get the job_id
+             # get the job_id
              # if job_name is None, get all job_ids
              # TODO: Only get the latest job_id, since that's the only one we use
-             code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
-                 job_name=job_name)
-             returncode, job_ids, stderr = self.run_on_head(handle,
-                                                            code,
-                                                            stream_logs=False,
-                                                            require_outputs=True,
-                                                            separate_stderr=True)
-             subprocess_utils.handle_returncode(returncode, code,
-                                                'Failed to sync down logs.',
-                                                stderr)
-             job_ids = message_utils.decode_payload(job_ids)
+
+             use_legacy = not handle.is_grpc_enabled_with_flag
+             logger.info(f'handle.is_grpc_enabled_with_flag: '
+                         f'{handle.is_grpc_enabled_with_flag}')
+             if not use_legacy:
+                 try:
+                     request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                         job_name=job_name)
+                     response = backend_utils.invoke_skylet_with_retries(
+                         lambda: SkyletClient(handle.get_grpc_channel(
+                         )).get_all_managed_job_ids_by_name(request))
+                     job_ids = list(response.job_ids)
+                 except exceptions.SkyletMethodNotImplementedError:
+                     use_legacy = True
+
+             if use_legacy:
+                 code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                     job_name=job_name)
+                 returncode, job_ids_payload, stderr = self.run_on_head(
+                     handle,
+                     code,
+                     stream_logs=False,
+                     require_outputs=True,
+                     separate_stderr=True)
+                 subprocess_utils.handle_returncode(returncode, code,
+                                                    'Failed to sync down logs.',
+                                                    stderr)
+                 job_ids = message_utils.decode_payload(job_ids_payload)
              if not job_ids:
                  logger.info(f'{colorama.Fore.YELLOW}'
                              'No matching job found'
@@ -4847,18 +4940,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          else:
              # get the run_timestamp
              # the function takes in [job_id]
-             code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
-             returncode, run_timestamps_payload, stderr = self.run_on_head(
-                 handle,
-                 code,
-                 stream_logs=False,
-                 require_outputs=True,
-                 separate_stderr=True)
-             subprocess_utils.handle_returncode(returncode, code,
-                                                'Failed to sync logs.', stderr)
-             # returns with a dict of {job_id: run_timestamp}
-             run_timestamps = message_utils.decode_payload(
-                 run_timestamps_payload)
+             use_legacy = not handle.is_grpc_enabled_with_flag
+             if not use_legacy:
+                 try:
+                     log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                         job_ids=[job_id])
+                     log_dirs_response = (
+                         backend_utils.invoke_skylet_with_retries(
+                             lambda: SkyletClient(handle.get_grpc_channel(
+                             )).get_log_dirs_for_jobs(log_dirs_request)))
+                     job_log_dirs = log_dirs_response.job_log_dirs
+                     # Convert back to the expected format
+                     # {job_id: run_timestamp}
+                     run_timestamps = {}
+                     for jid, log_dir in job_log_dirs.items():
+                         run_timestamps[int(jid)] = log_dir
+                 except exceptions.SkyletMethodNotImplementedError:
+                     use_legacy = True
+
+             if use_legacy:
+                 code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                     [str(job_id)])
+                 returncode, run_timestamps_payload, stderr = self.run_on_head(
+                     handle,
+                     code,
+                     stream_logs=False,
+                     require_outputs=True,
+                     separate_stderr=True)
+                 subprocess_utils.handle_returncode(returncode, code,
+                                                    'Failed to sync logs.',
+                                                    stderr)
+                 # returns with a dict of {job_id: run_timestamp}
+                 run_timestamps = message_utils.decode_payload(
+                     run_timestamps_payload)
          if not run_timestamps:
              logger.info(f'{colorama.Fore.YELLOW}'
                          'No matching log directories found'
@@ -4925,6 +5039,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                              exist_ok=True)
              log_file = os.path.join(local_log_dir, 'run.log')

+             # TODO(kevin): Migrate stream_logs to gRPC
              code = managed_jobs.ManagedJobCodeGen.stream_logs(
                  job_name=None,
                  job_id=int(job_id),
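
Note: most hunks in this file share one shape: try the Skylet gRPC path first, and fall back to legacy code-gen execution when the remote Skylet predates the RPC (surfaced as SkyletMethodNotImplementedError by backend_utils._handle_grpc_error). A condensed sketch of that pattern; the helper name below is hypothetical and not part of the diff, the real call sites inline it per request type:

# Hypothetical helper illustrating the gRPC-first / legacy-fallback pattern.
from sky import exceptions

def _grpc_or_legacy(handle, grpc_call, legacy_call):
    if handle.is_grpc_enabled_with_flag:
        try:
            return grpc_call()
        except exceptions.SkyletMethodNotImplementedError:
            # Old Skylet on the cluster (UNIMPLEMENTED/UNKNOWN was translated
            # by backend_utils._handle_grpc_error); fall through to legacy.
            pass
    return legacy_call()

For example, the managed-job-id lookup above would pass a lambda calling SkyletClient.get_all_managed_job_ids_by_name as grpc_call and the ManagedJobCodeGen / run_on_head / decode_payload path as legacy_call.
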
sky/client/cli/command.py CHANGED
@@ -59,6 +59,7 @@ from sky import task as task_lib
  from sky.adaptors import common as adaptors_common
  from sky.client import sdk
  from sky.client.cli import flags
+ from sky.client.cli import table_utils
  from sky.data import storage_utils
  from sky.provision.kubernetes import constants as kubernetes_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
@@ -2125,7 +2126,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
                  f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
                  f' {common_utils.format_exception(e)}')
              return
-         job_tables[cluster] = job_lib.format_job_queue(job_table)
+         job_tables[cluster] = table_utils.format_job_queue(job_table)

      subprocess_utils.run_in_parallel(_get_job_queue, clusters)
      user_str = 'all users' if all_users else 'current user'
@@ -5906,23 +5907,31 @@ def local():
      required=False,
      help='Name to use for the kubeconfig context. Defaults to "default". '
      'Used with the ip list.')
- @click.option(
-     '--name',
-     type=str,
-     required=False,
-     help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
  @click.option('--password',
                type=str,
                required=False,
                help='Password for the ssh-user to execute sudo commands. '
                'Required only if passwordless sudo is not setup.')
+ @click.option(
+     '--name',
+     type=str,
+     required=False,
+     help='Name of the cluster. Defaults to "skypilot". Used without ip list.')
+ @click.option(
+     '--port-start',
+     type=int,
+     required=False,
+     help='Starting port range for the local kind cluster. Needs to be a '
+     'multiple of 100. If not given, a random range will be used. '
+     'Used without ip list.')
  @local.command('up', cls=_DocumentedCodeCommand)
  @flags.config_option(expose_value=False)
  @_add_click_options(flags.COMMON_OPTIONS)
  @usage_lib.entrypoint
  def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
-              cleanup: bool, context_name: Optional[str], name: Optional[str],
-              password: Optional[str], async_call: bool):
+              cleanup: bool, context_name: Optional[str],
+              password: Optional[str], name: Optional[str],
+              port_start: Optional[int], async_call: bool):
      """Creates a local or remote cluster."""

      def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
@@ -5968,7 +5977,7 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
              f'Failed to read SSH key file {ssh_key_path}: {str(e)}')

      request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
-                               context_name, name, password)
+                               context_name, password, name, port_start)
      _async_call_or_wait(request_id, async_call, request_name='local up')


sky/client/cli/table_utils.py ADDED
@@ -0,0 +1,34 @@
+ """Utilities for formatting tables for CLI output."""
+ from typing import List
+
+ from sky.schemas.api import responses
+ from sky.utils import log_utils
+
+
+ def format_job_queue(jobs: List[responses.ClusterJobRecord]):
+     """Format the job queue for display.
+
+     Usage:
+         jobs = get_job_queue()
+         print(format_job_queue(jobs))
+     """
+     job_table = log_utils.create_table([
+         'ID', 'NAME', 'USER', 'SUBMITTED', 'STARTED', 'DURATION', 'RESOURCES',
+         'STATUS', 'LOG', 'GIT COMMIT'
+     ])
+     for job in jobs:
+         job_table.add_row([
+             job.job_id,
+             job.job_name,
+             job.username,
+             log_utils.readable_time_duration(job.submitted_at),
+             log_utils.readable_time_duration(job.start_at),
+             log_utils.readable_time_duration(job.start_at,
+                                              job.end_at,
+                                              absolute=True),
+             job.resources,
+             job.status.colored_str(),
+             job.log_path,
+             job.metadata.get('git_commit', '-'),
+         ])
+     return job_table
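
Note: the CLI now renders the records returned by sdk.queue() with this helper (see command.py above). A rough standalone sketch, assuming sdk.get() is used to resolve the request and 'my-cluster' is a placeholder cluster name:

from sky.client import sdk
from sky.client.cli import table_utils

# Fetch a cluster's job queue and print it as a table.
request_id = sdk.queue('my-cluster', skip_finished=True, all_users=False)
job_records = sdk.get(request_id)  # List[responses.ClusterJobRecord]
print(table_utils.format_job_queue(job_records))
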
sky/client/common.py CHANGED
@@ -44,8 +44,10 @@ logger = sky_logging.init_logger(__name__)
  _DOWNLOAD_CHUNK_BYTES = 8192
  # The chunk size for the zip file to be uploaded to the API server. We split
  # the zip file into chunks to avoid network issues for large request body that
- # can be caused by NGINX's client_max_body_size.
- _UPLOAD_CHUNK_BYTES = 512 * 1024 * 1024
+ # can be caused by NGINX's client_max_body_size or Cloudflare's upload limit.
+ # As of 09/25/2025, the upload limit for Cloudflare's free plan is 100MiB:
+ # https://developers.cloudflare.com/support/troubleshooting/http-status-codes/4xx-client-error/error-413/
+ _UPLOAD_CHUNK_BYTES = 100 * 1024 * 1024

  FILE_UPLOAD_LOGS_DIR = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                      'file_uploads')
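
Note: shrinking the chunk size from 512 MiB to 100 MiB keeps each request under Cloudflare's free-plan upload limit, at the cost of more requests per archive. A quick back-of-the-envelope sketch (the helper name is illustrative):

import math

_UPLOAD_CHUNK_BYTES = 100 * 1024 * 1024  # new value from the diff above

def num_upload_chunks(zip_size_bytes: int) -> int:
    """How many chunked requests a zipped upload of this size needs."""
    return max(1, math.ceil(zip_size_bytes / _UPLOAD_CHUNK_BYTES))

# A 1 GiB archive now takes 11 requests; with the old 512 MiB chunks it took 2.
assert num_upload_chunks(1024 * 1024 * 1024) == 11
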
sky/client/sdk.py CHANGED
@@ -1267,9 +1267,11 @@ def autostop(
  @usage_lib.entrypoint
  @server_common.check_server_healthy_or_start
  @annotations.client_api
- def queue(cluster_name: str,
-           skip_finished: bool = False,
-           all_users: bool = False) -> server_common.RequestId[List[dict]]:
+ def queue(
+     cluster_name: str,
+     skip_finished: bool = False,
+     all_users: bool = False
+ ) -> server_common.RequestId[List[responses.ClusterJobRecord]]:
      """Gets the job queue of a cluster.

      Args:
@@ -1282,8 +1284,8 @@ def queue(cluster_name: str,
          The request ID of the queue request.

      Request Returns:
-         job_records (List[Dict[str, Any]]): A list of dicts for each job in the
-             queue.
+         job_records (List[responses.ClusterJobRecord]): A list of job records
+             for each job in the queue.

      .. code-block:: python

@@ -1677,8 +1679,9 @@ def local_up(gpus: bool,
               ssh_key: Optional[str],
               cleanup: bool,
               context_name: Optional[str] = None,
+              password: Optional[str] = None,
               name: Optional[str] = None,
-              password: Optional[str] = None) -> server_common.RequestId[None]:
+              port_start: Optional[int] = None) -> server_common.RequestId[None]:
      """Launches a Kubernetes cluster on local machines.

      Returns:
@@ -1698,8 +1701,9 @@ def local_up(gpus: bool,
          ssh_key=ssh_key,
          cleanup=cleanup,
          context_name=context_name,
+         password=password,
          name=name,
-         password=password)
+         port_start=port_start)
      response = server_common.make_authenticated_request(
          'POST', '/local_up', json=json.loads(body.model_dump_json()))
      return server_common.get_request_id(response)
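
Note: password now precedes name in local_up(), and port_start is appended, so positional callers need updating (the CLI call in command.py above was). A keyword-style sketch with placeholder values:

from sky.client import sdk

# All values are illustrative; ips=None creates a local kind cluster.
request_id = sdk.local_up(
    gpus=True,
    ips=None,
    ssh_user=None,
    ssh_key=None,
    cleanup=False,
    context_name='default',
    password=None,
    name='skypilot',
    port_start=30100,  # must be a multiple of 100, per the new CLI option
)
sdk.stream_and_get(request_id)  # wait for the request to finish
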
sky/client/sdk_async.py CHANGED
@@ -523,11 +523,11 @@ async def autostop(
  @usage_lib.entrypoint
  @annotations.client_api
  async def queue(
-         cluster_name: str,
-         skip_finished: bool = False,
-         all_users: bool = False,
-         stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
- ) -> List[dict]:
+     cluster_name: str,
+     skip_finished: bool = False,
+     all_users: bool = False,
+     stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG
+ ) -> List[responses.ClusterJobRecord]:
      """Async version of queue() that gets the job queue of a cluster."""
      request_id = await context_utils.to_thread(sdk.queue, cluster_name,
                                                 skip_finished, all_users)
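
Note: the async wrapper resolves the request itself and now returns typed records. A small sketch ('my-cluster' is a placeholder; the printed attributes follow the fields used by table_utils.format_job_queue above):

import asyncio

from sky.client import sdk_async

async def show_queue() -> None:
    records = await sdk_async.queue('my-cluster', all_users=True)
    for record in records:
        print(record.job_id, record.job_name, record.status)

asyncio.run(show_queue())
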