skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/task.py CHANGED
@@ -24,6 +24,7 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import schemas
 from sky.utils import ux_utils
+from sky.volumes import volume as volume_lib

 if typing.TYPE_CHECKING:
     import yaml
@@ -246,12 +247,14 @@ class Task:
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[str] = None,
         num_nodes: Optional[int] = None,
+        volumes: Optional[Dict[str, str]] = None,
         # Advanced:
         docker_image: Optional[str] = None,
         event_callback: Optional[str] = None,
         blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
         # Internal use only.
         file_mounts_mapping: Optional[Dict[str, str]] = None,
+        volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
     ):
         """Initializes a Task.

@@ -319,6 +322,7 @@ class Task:
         self.setup = setup
         self._envs = envs or {}
         self._secrets = secrets or {}
+        self._volumes = volumes or {}

         # Validate Docker login configuration early if both envs and secrets
         # contain Docker variables
@@ -342,8 +346,7 @@ class Task:
         self.resources: Union[List[sky.Resources],
                               Set[sky.Resources]] = {sky.Resources()}
         self._service: Optional[service_spec.SkyServiceSpec] = None
-        # The priority of the managed job running this task.
-        self._job_priority: Optional[int] = None
+
         # Resources that this task cannot run on.
         self.blocked_resources = blocked_resources

@@ -362,7 +365,9 @@ class Task:
         self.best_resources: Optional[sky.Resources] = None

         # For internal use only.
-        self.file_mounts_mapping = file_mounts_mapping
+        self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
+        self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
+            volume_mounts)

         dag = sky.dag.get_current_dag()
         if dag is not None:
@@ -443,12 +448,9 @@ class Task:
         if self.file_mounts is None:
             return
         for target, source in self.file_mounts.items():
-            if target.endswith('/') or source.endswith('/'):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'File mount paths cannot end with a slash '
-                        '(try "/mydir: /mydir" or "/myfile: /myfile"). '
-                        f'Found: target={target} source={source}')
+            location = f'file_mounts.{target}: {source}'
+            self._validate_mount_path(target, location)
+            self._validate_path(source, location)
             if data_utils.is_cloud_store_url(target):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
@@ -463,17 +465,25 @@ class Task:
                         f'File mount source {source!r} does not exist '
                         'locally. To fix: check if it exists, and correct '
                         'the path.')
-            # TODO(zhwu): /home/username/sky_workdir as the target path need
-            # to be filtered out as well.
-            if (target == constants.SKY_REMOTE_WORKDIR and
-                    self.workdir is not None):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
-                        'destination path of a file mount, as it will be used '
-                        'by the workdir. If uploading a file/folder to the '
-                        'workdir is needed, please specify the full path to '
-                        'the file/folder.')
+
+    def _validate_mount_path(self, path: str, location: str):
+        self._validate_path(path, location)
+        # TODO(zhwu): /home/username/sky_workdir as the target path need
+        # to be filtered out as well.
+        if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
+                    'destination path of a file mount, as it will be used '
+                    'by the workdir. If uploading a file/folder to the '
+                    'workdir is needed, please specify the full path to '
+                    'the file/folder.')
+
+    def _validate_path(self, path: str, location: str):
+        if path.endswith('/'):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Mount paths cannot end with a slash. '
+                                 f'Found: {path} in {location}')

     def expand_and_validate_workdir(self):
         """Expand workdir to absolute path and validate it.
@@ -588,6 +598,7 @@ class Task:
             secrets=config.pop('secrets', None),
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
+            volumes=config.pop('volumes', None),
         )

         # Create lists to store storage objects inlined in file_mounts.
@@ -712,9 +723,15 @@ class Task:
             service = service_spec.SkyServiceSpec.from_yaml_config(service)
             task.set_service(service)

-        job = config.pop('job', None)
-        if job is not None and 'priority' in job:
-            task.set_job_priority(job['priority'])
+        volume_mounts = config.pop('volume_mounts', None)
+        if volume_mounts is not None:
+            task.volume_mounts = []
+            for vol in volume_mounts:
+                common_utils.validate_schema(vol,
+                                             schemas.get_volume_mount_schema(),
+                                             'Invalid volume mount config: ')
+                volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
+                task.volume_mounts.append(volume_mount)

         assert not config, f'Invalid task args: {config.keys()}'
         return task
@@ -750,6 +767,97 @@ class Task:
             config = {}
         return Task.from_yaml_config(config)

+    def resolve_and_validate_volumes(self) -> None:
+        """Resolve volumes config to volume mounts and validate them.
+
+        Raises:
+            exceptions.VolumeNotFoundError: if any volume is not found.
+            exceptions.VolumeTopologyConflictError: if there is a conflict
+                between the volumes and the compute topology.
+        """
+        # Volumes have been resolved; a typical case is that the API server
+        # has resolved the volumes and the dag was then submitted to
+        # controllers.
+        if self.volume_mounts is not None:
+            return None
+        if not self._volumes:
+            return None
+        volume_mounts: List[volume_lib.VolumeMount] = []
+        for dst_path, vol in self._volumes.items():
+            self._validate_mount_path(dst_path, location='volumes')
+            # Shortcut for `dst_path: volume_name`
+            if isinstance(vol, str):
+                volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
+            elif isinstance(vol, dict):
+                assert 'name' in vol, 'Volume name must be set.'
+                volume_mount = volume_lib.VolumeMount.resolve(
+                    dst_path, vol['name'])
+            else:
+                raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
+            volume_mounts.append(volume_mount)
+        # Disable certain access modes
+        disabled_modes = {}
+        if self.num_nodes > 1:
+            disabled_modes[
+                volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
+                    'access mode ReadWriteOnce is not supported for '
+                    'multi-node tasks.')
+            disabled_modes[
+                volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
+                    'access mode ReadWriteOncePod is not supported for '
+                    'multi-node tasks.')
+        # TODO(aylei): generalize access mode to all volume types
+        # Record the required topology and the volume that requires it, e.g.
+        # {'cloud': ('volume_name', 'aws')}
+        topology: Dict[str, Tuple[str, Optional[str]]] = {
+            'cloud': ('', None),
+            'region': ('', None),
+            'zone': ('', None),
+        }
+        for vol in volume_mounts:
+            # Check access mode
+            access_mode = vol.volume_config.config.get('access_mode', '')
+            if access_mode in disabled_modes:
+                raise ValueError(f'Volume {vol.volume_name} with '
+                                 f'{disabled_modes[access_mode]}')
+            # Check topology
+            for key, (vol_name, previous_req) in topology.items():
+                req = getattr(vol.volume_config, key)
+                if req is not None:
+                    if previous_req is not None and req != previous_req:
+                        raise exceptions.VolumeTopologyConflictError(
+                            f'Volume {vol.volume_name} can only be attached on '
+                            f'{key}:{req}, which conflicts with another volume '
+                            f'{vol_name} that requires {key}:{previous_req}. '
+                            f'Please use different volumes and retry.')
+                    topology[key] = (vol_name, req)
+        # Now we have the topology requirements from the intersection of all
+        # volumes. Check if there is a topology conflict with the resources.
+        # Volumes must have no conflict with ALL resources even if the user
+        # specifies 'any_of' resources, to ensure no resources will conflict
+        # with the volumes during failover.
+
+        for res in self.resources:
+            for key, (vol_name, vol_req) in topology.items():
+                req = getattr(res, key)
+                if (req is not None and vol_req is not None and
+                        str(req) != vol_req):
+                    raise exceptions.VolumeTopologyConflictError(
+                        f'The task requires {key}:{req}, which conflicts with '
+                        f'the volume constraint {key}:{vol_req}. Please '
+                        f'use different volumes and retry.')
+        # No topology conflict; we can safely override the topology of
+        # resources to satisfy the volume constraints.
+        override_params = {}
+        for key, (vol_name, vol_req) in topology.items():
+            if vol_req is not None:
+                if key == 'cloud':
+                    override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
+                else:
+                    override_params[key] = vol_req
+        self.set_resources_override(override_params)
+        self.volume_mounts = volume_mounts
+
     @property
     def num_nodes(self) -> int:
         return self._num_nodes
@@ -772,6 +880,10 @@ class Task:
     def secrets(self) -> Dict[str, str]:
         return self._secrets

+    @property
+    def volumes(self) -> Dict[str, str]:
+        return self._volumes
+
     def update_envs(
             self, envs: Union[None, List[Tuple[str, str]],
                               Dict[str, str]]) -> 'Task':
@@ -976,23 +1088,6 @@ class Task:
         self._service = service
         return self

-    @property
-    def job_priority(self) -> Optional[int]:
-        """The priority of the managed job running this task."""
-        return self._job_priority
-
-    def set_job_priority(self, priority: int) -> 'Task':
-        """Sets the job priority for this task.
-
-        Args:
-            priority: an integer between 0 and 1000.
-
-        Returns:
-            self: The current task, with job priority set.
-        """
-        self._job_priority = priority
-        return self
-
     def set_time_estimator(self, func: Callable[['sky.Resources'],
                                                 int]) -> 'Task':
         """Sets a func mapping resources to estimated time (secs).
@@ -1436,9 +1531,6 @@ class Task:
         if self.service is not None:
             add_if_not_none('service', self.service.to_yaml_config())

-        if self.job_priority is not None:
-            add_if_not_none('job', {'priority': self.job_priority})
-
         add_if_not_none('num_nodes', self.num_nodes)

         if self.inputs is not None:
@@ -1478,6 +1570,12 @@ class Task:
         })

         add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
+        add_if_not_none('volumes', self.volumes)
+        if self.volume_mounts is not None:
+            config['volume_mounts'] = [
+                volume_mount.to_yaml_config()
+                for volume_mount in self.volume_mounts
+            ]
         return config

     def get_required_cloud_features(
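
Taken together, the task.py changes above give Task a user-facing `volumes` field that `resolve_and_validate_volumes()` later expands into `VolumeMount` objects. A minimal usage sketch based only on the signatures shown in this diff; the volume name 'my-data' is a placeholder and must refer to a volume that already exists on the API server:

    import sky

    # Shortcut form: destination path -> volume name.
    task = sky.Task(run='python train.py', volumes={'/mnt/data': 'my-data'})

    # Dict form with an explicit 'name' key, as handled by
    # resolve_and_validate_volumes() above.
    task = sky.Task.from_yaml_config({
        'run': 'python train.py',
        'volumes': {'/mnt/data': {'name': 'my-data'}},
    })

    # Resolution validates mount paths, rejects ReadWriteOnce/ReadWriteOncePod
    # access modes for multi-node tasks, and checks that all volumes agree on
    # cloud/region/zone before overriding the task's resources to match.
    task.resolve_and_validate_volumes()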
sky/templates/jobs-controller.yaml.j2 CHANGED
@@ -31,7 +31,9 @@ setup: |
 {% endif %}

 run: |
+  {%- if consolidation_mode_job_id is none %}
   {{ sky_activate_python_env }}
+  {%- endif %}

   # Write env vars to a file
   {%- for env_name, env_value in controller_envs.items() %}
@@ -42,9 +44,18 @@ run: |
   # Note: The job is already in the `spot` table, marked as PENDING.
   # CloudVmRayBackend._exec_code_on_head() calls
   # managed_job_codegen.set_pending() before we get here.
-  python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
+  {%- if consolidation_mode_job_id is not none %}
+  {{sky_python_cmd}} \
+  {%- else %}
+  python \
+  {%- endif %}
+    -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
     --user-yaml-path {{remote_original_user_yaml_path}} \
+  {%- if consolidation_mode_job_id is not none %}
+    --job-id {{consolidation_mode_job_id}} \
+  {%- else %}
     --job-id $SKYPILOT_INTERNAL_JOB_ID \
+  {%- endif %}
     --env-file {{remote_env_file_path}} \
     --priority {{priority}}

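The consolidation-mode branches above swap both the interpreter and the job-id source. A small jinja2 sketch (not SkyPilot's actual rendering path; the variable values here are invented) of what the two branches reduce to:

    import jinja2

    tmpl = jinja2.Template(
        '{%- if consolidation_mode_job_id is not none %}'
        '{{sky_python_cmd}} -u -m sky.jobs.scheduler --job-id {{consolidation_mode_job_id}}'
        '{%- else %}'
        'python -u -m sky.jobs.scheduler --job-id $SKYPILOT_INTERNAL_JOB_ID'
        '{%- endif %}')

    # Consolidation mode: the server's own Python runs the scheduler with a
    # pre-assigned job id (no controller venv activation).
    print(tmpl.render(consolidation_mode_job_id=7,
                      sky_python_cmd='/opt/sky/bin/python'))
    # Normal mode: the job id comes from the controller environment.
    print(tmpl.render(consolidation_mode_job_id=None))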
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -243,6 +243,22 @@ provider:
        # This selector must match the head node pod's selector below.
        selector:
          component: {{cluster_name_on_cloud}}-head
+    # Headless services mapping hostnames to the rest of the worker nodes
+    {% for worker_id in range(1, num_nodes) %}
+    - apiVersion: v1
+      kind: Service
+      metadata:
+        labels:
+          parent: skypilot
+          skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-user: {{ user }}
+        name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+      spec:
+        selector:
+          component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+        clusterIP: None
+    {% endfor %}
+

 # Specify the pod type for the ray head node (as configured below).
 head_node_type: ray_head_default
@@ -255,7 +271,7 @@ available_node_types:
       metadata:
         # name will be filled in the provisioner
         # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
-        # service is required.
+        # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
         labels:
           parent: skypilot
           # component will be set for the head node pod to be the same as the head node service selector above if a
@@ -287,6 +303,10 @@ available_node_types:
         serviceAccountName: {{k8s_service_account_name}}
         automountServiceAccountToken: {{k8s_automount_sa_token}}
         restartPolicy: {{ "Always" if high_availability else "Never" }}
+        {% if volume_mounts %}
+        securityContext:
+          fsGroup: 1000
+        {% endif %}

         # Add node selector if GPU/TPUs are requested:
         {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -365,6 +385,11 @@ available_node_types:
            persistentVolumeClaim:
              claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
          {% endif %}
+          {% for volume_mount in volume_mounts %}
+          - name: {{volume_mount.name}}
+            persistentVolumeClaim:
+              claimName: {{volume_mount.volume_name_on_cloud}}
+          {% endfor %}
         containers:
         - name: ray-node
           imagePullPolicy: IfNotPresent
@@ -641,7 +666,7 @@ available_node_types:
            {% if high_availability %}
            mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
            if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
-              SKYPILOT_HA_RECOVERY_LOG="/tmp/ha_recovery.log"
+              SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
              echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
              start_time=$SECONDS
              retry_count=0
@@ -734,6 +759,10 @@ available_node_types:
            - name: fusermount-shared-dir
              mountPath: {{k8s_fusermount_shared_dir}}
            {% endif %}
+            {% for volume_mount in volume_mounts %}
+            - name: {{volume_mount.name}}
+              mountPath: {{volume_mount.path}}
+            {% endfor %}
           resources:
             requests:
               cpu: {{cpus}}
sky/users/permission.py CHANGED
@@ -18,6 +18,8 @@ from sky.utils import common_utils

 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
+logging.getLogger('casbin.model').setLevel(sky_logging.ERROR)
+logging.getLogger('casbin.rbac').setLevel(sky_logging.ERROR)
 logger = sky_logging.init_logger(__name__)

 # Filelocks for the policy update.
sky/utils/admin_policy_utils.py CHANGED
@@ -140,13 +140,17 @@ def apply(
                 at_client_side)
     try:
         mutated_user_request = policy.apply(user_request)
+    # Avoid duplicate exception wrapping.
+    except exceptions.UserRequestRejectedByPolicy as e:
+        with ux_utils.print_exception_no_traceback():
+            raise e
     except Exception as e:  # pylint: disable=broad-except
         with ux_utils.print_exception_no_traceback():
             raise exceptions.UserRequestRejectedByPolicy(
                 f'{colorama.Fore.RED}User request rejected by policy '
                 f'{policy!r}{colorama.Fore.RESET}: '
                 f'{common_utils.format_exception(e, use_bracket=True)}'
-            ) from e
+            ) from None
     if mutated_config is None:
         mutated_config = mutated_user_request.skypilot_config
     else:
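
The new except arm above is the usual guard against double wrapping: a request already rejected with the target exception type passes through untouched, and only foreign exceptions get wrapped, with `from None` suppressing the chained traceback. A generic sketch of the same shape (the names here are stand-ins, not SkyPilot's):

    class PolicyError(Exception):
        """Stand-in for exceptions.UserRequestRejectedByPolicy."""


    def apply_policy(policy, request):
        try:
            return policy.apply(request)
        except PolicyError:
            # Already the right type: re-raise as-is instead of wrapping again.
            raise
        except Exception as e:  # pylint: disable=broad-except
            # Wrap once; `from None` drops the noisy chained traceback.
            raise PolicyError(f'Request rejected by {policy!r}: {e}') from None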
sky/utils/cli_utils/status_utils.py CHANGED
@@ -7,7 +7,6 @@ import colorama

 from sky import backends
 from sky.utils import common_utils
-from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import resources_utils
 from sky.utils import status_lib
@@ -137,7 +136,8 @@ def get_total_cost_of_displayed_records(

 def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
                            show_all: bool,
-                           controller_name: Optional[str] = None):
+                           controller_name: Optional[str] = None,
+                           days: Optional[int] = None):
     """Compute cluster table values and display for cost report.

     For each cluster, this shows: cluster name, resources, launched time,
@@ -200,23 +200,21 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
         cluster_table.add_row(row)

     if cluster_records:
+        controller_record = cluster_records[0]
         if controller_name is not None:
-            controller = controller_utils.Controllers.from_name(controller_name)
-            if controller is None:
-                raise ValueError(f'Controller {controller_name} not found.')
-            controller_handle: backends.CloudVmRayResourceHandle = (
-                cluster_records[0]['handle'])
-            autostop_config = (
-                controller_handle.launched_resources.autostop_config)
-            if autostop_config is not None:
+            autostop = controller_record.get('autostop', None)
+            autostop_str = ''
+            if autostop is not None:
                 autostop_str = (f'{colorama.Style.DIM} (will be autostopped if '
-                                f'idle for {autostop_config.idle_minutes}min)'
+                                f'idle for {autostop}min)'
                                 f'{colorama.Style.RESET_ALL}')
             click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                        f'{controller_name}{colorama.Style.RESET_ALL}'
                        f'{autostop_str}')
         else:
-            click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
+            days_str = '' if days is None else f' (last {days} days)'
+            click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                       f'Clusters{days_str}'
                        f'{colorama.Style.RESET_ALL}')
         click.echo(cluster_table)

@@ -345,7 +343,9 @@ def _get_infra(cluster_record: _ClusterRecord, truncate: bool = True) -> str:


 def _get_status_value_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> int:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> int:
+    del truncate
     status = cluster_cost_report_record['status']
     if status is None:
         return -1
@@ -353,7 +353,9 @@


 def _get_status_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     status = cluster_cost_report_record['status']
     if status is None:
         return f'{colorama.Style.DIM}TERMINATED{colorama.Style.RESET_ALL}'
@@ -361,7 +363,9 @@


 def _get_resources_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     launched_nodes = cluster_cost_report_record['num_nodes']
     launched_resources = cluster_cost_report_record['resources']

@@ -373,7 +377,9 @@


 def _get_price_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     launched_nodes = cluster_cost_report_record['num_nodes']
     launched_resources = cluster_cost_report_record['resources']

@@ -383,7 +389,9 @@


 def _get_estimated_cost_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     cost = cluster_cost_report_record['total_cost']

     if not cost:
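
All the `_get_*_for_cost_report` helpers above gain an unused `truncate` keyword (immediately discarded with `del truncate`) so they share one calling convention with the other column getters. A hedged sketch of why that uniformity helps; `render_row` is illustrative, not a function in this module:

    from typing import Any, Callable, Dict

    Renderer = Callable[..., Any]

    def render_row(record: Dict[str, Any], renderers: Dict[str, Renderer],
                   truncate: bool = True) -> Dict[str, Any]:
        # Callers can pass `truncate` to every getter uniformly, without
        # knowing which renderers actually use it.
        return {col: fn(record, truncate=truncate)
                for col, fn in renderers.items()}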