skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/task.py
CHANGED
```diff
@@ -24,6 +24,7 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import schemas
 from sky.utils import ux_utils
+from sky.volumes import volume as volume_lib
 
 if typing.TYPE_CHECKING:
     import yaml
@@ -246,12 +247,14 @@ class Task:
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[str] = None,
         num_nodes: Optional[int] = None,
+        volumes: Optional[Dict[str, str]] = None,
         # Advanced:
         docker_image: Optional[str] = None,
         event_callback: Optional[str] = None,
         blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
         # Internal use only.
         file_mounts_mapping: Optional[Dict[str, str]] = None,
+        volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
     ):
         """Initializes a Task.
 
```
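The constructor now accepts a user-facing `volumes` mapping alongside the internal, resolved `volume_mounts` list. A minimal sketch of the new parameter, based on the signature above (the volume name `my-data` and the mount path are hypothetical):

```python
import sky

# Map a destination path on the cluster to the name of a pre-created
# volume. '/mnt/data' and 'my-data' are illustrative values.
task = sky.Task(
    run='python train.py --data-dir /mnt/data',
    volumes={'/mnt/data': 'my-data'},
)
```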
```diff
@@ -319,6 +322,7 @@ class Task:
         self.setup = setup
         self._envs = envs or {}
         self._secrets = secrets or {}
+        self._volumes = volumes or {}
 
         # Validate Docker login configuration early if both envs and secrets
         # contain Docker variables
@@ -342,8 +346,7 @@ class Task:
         self.resources: Union[List[sky.Resources],
                               Set[sky.Resources]] = {sky.Resources()}
         self._service: Optional[service_spec.SkyServiceSpec] = None
-
-        self._job_priority: Optional[int] = None
+
         # Resources that this task cannot run on.
         self.blocked_resources = blocked_resources
 
@@ -362,7 +365,9 @@ class Task:
         self.best_resources: Optional[sky.Resources] = None
 
         # For internal use only.
-        self.file_mounts_mapping = file_mounts_mapping
+        self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
+        self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
+            volume_mounts)
 
         dag = sky.dag.get_current_dag()
         if dag is not None:
@@ -443,12 +448,9 @@ class Task:
         if self.file_mounts is None:
             return
         for target, source in self.file_mounts.items():
-            if target.endswith('/') or source.endswith('/'):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'File mount paths cannot end with a slash '
-                        '(try "/mydir: /mydir" or "/myfile: /myfile"). '
-                        f'Found: target={target} source={source}')
+            location = f'file_mounts.{target}: {source}'
+            self._validate_mount_path(target, location)
+            self._validate_path(source, location)
             if data_utils.is_cloud_store_url(target):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
```
```diff
@@ -463,17 +465,25 @@ class Task:
                         f'File mount source {source!r} does not exist '
                         'locally. To fix: check if it exists, and correct '
                         'the path.')
-
-            # TODO(zhwu): /home/username/sky_workdir as the target path need
-            # to be filtered out as well.
-            if (target == constants.SKY_REMOTE_WORKDIR and
-                    self.workdir is not None):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
-                        'destination path of a file mount, as it will be used '
-                        'by the workdir. If uploading a file/folder to the '
-                        'workdir is needed, please specify the full path.')
+
+    def _validate_mount_path(self, path: str, location: str):
+        self._validate_path(path, location)
+        # TODO(zhwu): /home/username/sky_workdir as the target path need
+        # to be filtered out as well.
+        if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
+                    'destination path of a file mount, as it will be used '
+                    'by the workdir. If uploading a file/folder to the '
+                    'workdir is needed, please specify the full path to '
+                    'the file/folder.')
+
+    def _validate_path(self, path: str, location: str):
+        if path.endswith('/'):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Mount paths cannot end with a slash '
+                                 f'Found: {path} in {location}')
 
     def expand_and_validate_workdir(self):
         """Expand workdir to absolute path and validate it.
```
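The old inline check is split into two reusable helpers: `_validate_path` rejects any mount path ending in a slash, and `_validate_mount_path` additionally refuses the reserved remote workdir as a destination when a workdir is set. A distilled, standalone sketch of the rule (not SkyPilot's actual code; the workdir constant is an illustrative value):

```python
SKY_REMOTE_WORKDIR = '~/sky_workdir'  # illustrative value of the constant

def validate_mount_path(path: str, workdir_set: bool) -> None:
    # Trailing slashes are ambiguous for mounts, so they are rejected.
    if path.endswith('/'):
        raise ValueError(f'Mount paths cannot end with a slash. Found: {path}')
    # The workdir destination is reserved when a workdir is configured.
    if path == SKY_REMOTE_WORKDIR and workdir_set:
        raise ValueError(
            f'Cannot use {SKY_REMOTE_WORKDIR!r} as a mount destination.')

validate_mount_path('/mnt/data', workdir_set=True)     # OK
# validate_mount_path('/mnt/data/', workdir_set=True)  # raises ValueError
```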
```diff
@@ -588,6 +598,7 @@ class Task:
             secrets=config.pop('secrets', None),
             event_callback=config.pop('event_callback', None),
             file_mounts_mapping=config.pop('file_mounts_mapping', None),
+            volumes=config.pop('volumes', None),
         )
 
         # Create lists to store storage objects inlined in file_mounts.
@@ -712,9 +723,15 @@ class Task:
             service = service_spec.SkyServiceSpec.from_yaml_config(service)
             task.set_service(service)
 
-        job = config.pop('job', None)
-        if job is not None and 'priority' in job:
-            task.set_job_priority(job['priority'])
+        volume_mounts = config.pop('volume_mounts', None)
+        if volume_mounts is not None:
+            task.volume_mounts = []
+            for vol in volume_mounts:
+                common_utils.validate_schema(vol,
+                                             schemas.get_volume_mount_schema(),
+                                             'Invalid volume mount config: ')
+                volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
+                task.volume_mounts.append(volume_mount)
 
         assert not config, f'Invalid task args: {config.keys()}'
         return task
```
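`from_yaml_config` now carries both the user-facing `volumes` field and the resolved, schema-validated `volume_mounts` list. A sketch of the parsing path, assuming this nightly build (the volume name is hypothetical):

```python
import sky

# A task config carrying the new `volumes` field, as parsed above.
config = {
    'run': 'ls /mnt/data',
    # Shortcut form: destination path -> volume name.
    'volumes': {'/mnt/data': 'my-data'},
}
task = sky.Task.from_yaml_config(config)
print(task.volumes)  # expected: {'/mnt/data': 'my-data'}
```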
```diff
@@ -750,6 +767,97 @@ class Task:
             config = {}
         return Task.from_yaml_config(config)
 
+    def resolve_and_validate_volumes(self) -> None:
+        """Resolve volumes config to volume mounts and validate them.
+
+        Raises:
+            exceptions.VolumeNotFoundError: if any volume is not found.
+            exceptions.VolumeTopologyConflictError: if there is conflict in the
+                volumes and compute topology.
+        """
+        # Volumes has been resolved, a typical case is that the API server
+        # has resolved the volumes and the dag was then submitted to
+        # controllers.
+        if self.volume_mounts is not None:
+            return None
+        if not self._volumes:
+            return None
+        volume_mounts: List[volume_lib.VolumeMount] = []
+        for dst_path, vol in self._volumes.items():
+            self._validate_mount_path(dst_path, location='volumes')
+            # Shortcut for `dst_path: volume_name`
+            if isinstance(vol, str):
+                volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
+            elif isinstance(vol, dict):
+                assert 'name' in vol, 'Volume name must be set.'
+                volume_mount = volume_lib.VolumeMount.resolve(
+                    dst_path, vol['name'])
+            else:
+                raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
+            volume_mounts.append(volume_mount)
+        # Disable certain access modes
+        disabled_modes = {}
+        if self.num_nodes > 1:
+            disabled_modes[
+                volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
+                    'access mode ReadWriteOnce is not supported for '
+                    'multi-node tasks.')
+            disabled_modes[
+                volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
+                    'access mode ReadWriteOncePod is not supported for '
+                    'multi-node tasks.')
+        # TODO(aylei): generalize access mode to all volume types
+        # Record the required topology and the volume that requires it, e.g.
+        # {'cloud': ('volume_name', 'aws')}
+        topology: Dict[str, Tuple[str, Optional[str]]] = {
+            'cloud': ('', None),
+            'region': ('', None),
+            'zone': ('', None),
+        }
+        for vol in volume_mounts:
+            # Check access mode
+            access_mode = vol.volume_config.config.get('access_mode', '')
+            if access_mode in disabled_modes:
+                raise ValueError(f'Volume {vol.volume_name} with '
+                                 f'{disabled_modes[access_mode]}')
+            # Check topology
+            for key, (vol_name, previous_req) in topology.items():
+                req = getattr(vol.volume_config, key)
+                if req is not None:
+                    if previous_req is not None and req != previous_req:
+                        raise exceptions.VolumeTopologyConflictError(
+                            f'Volume {vol.volume_name} can only be attached on '
+                            f'{key}:{req}, which conflicts with another volume '
+                            f'{vol_name} that requires {key}:{previous_req}.'
+                            f'Please use different volumes and retry.')
+                    topology[key] = (vol_name, req)
+        # Now we have the topology requirements from the intersection of all
+        # volumes. Check if there is topology conflict with the resources.
+        # Volume must have no conflict with ALL resources even if user
+        # specifies 'any_of' resources to ensure no resources will conflict
+        # with the volumes during failover.
+
+        for res in self.resources:
+            for key, (vol_name, vol_req) in topology.items():
+                req = getattr(res, key)
+                if (req is not None and vol_req is not None and
+                        str(req) != vol_req):
+                    raise exceptions.VolumeTopologyConflictError(
+                        f'The task requires {key}:{req}, which conflicts with '
+                        f'the volume constraint {key}:{vol_req}. Please '
+                        f'use different volumes and retry.')
+        # No topology conflict, we safely override the topology of resources to
+        # satisfy the volume constraints.
+        override_params = {}
+        for key, (vol_name, vol_req) in topology.items():
+            if vol_req is not None:
+                if key == 'cloud':
+                    override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
+                else:
+                    override_params[key] = vol_req
+        self.set_resources_override(override_params)
+        self.volume_mounts = volume_mounts
+
     @property
     def num_nodes(self) -> int:
         return self._num_nodes
```
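The method intersects the topology pins (cloud/region/zone) of all attached volumes, rejects any conflict among volumes or with the task's resources, and then overrides the resources to satisfy the pins. A distilled, standalone sketch of the intersection step (not SkyPilot's code; names and values are illustrative):

```python
from typing import Dict, Optional, Tuple

def intersect_topology(
    volumes: Dict[str, Dict[str, Optional[str]]],
) -> Dict[str, Tuple[str, Optional[str]]]:
    # Each entry records (volume that pinned it, required value).
    topology: Dict[str, Tuple[str, Optional[str]]] = {
        'cloud': ('', None), 'region': ('', None), 'zone': ('', None),
    }
    for vol_name, pins in volumes.items():
        for key, (prev_vol, prev_req) in topology.items():
            req = pins.get(key)
            if req is None:
                continue
            if prev_req is not None and req != prev_req:
                raise ValueError(
                    f'Volume {vol_name} requires {key}:{req}, conflicting '
                    f'with volume {prev_vol} ({key}:{prev_req}).')
            topology[key] = (vol_name, req)
    return topology

# Two volumes with compatible pins intersect cleanly:
print(intersect_topology({
    'vol-a': {'cloud': 'aws', 'region': 'us-east-1'},
    'vol-b': {'cloud': 'aws'},
}))
# {'cloud': ('vol-b', 'aws'), 'region': ('vol-a', 'us-east-1'), 'zone': ('', None)}
```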
```diff
@@ -772,6 +880,10 @@ class Task:
     def secrets(self) -> Dict[str, str]:
         return self._secrets
 
+    @property
+    def volumes(self) -> Dict[str, str]:
+        return self._volumes
+
     def update_envs(
             self, envs: Union[None, List[Tuple[str, str]],
                               Dict[str, str]]) -> 'Task':
@@ -976,23 +1088,6 @@ class Task:
         self._service = service
         return self
 
-    @property
-    def job_priority(self) -> Optional[int]:
-        """The priority of the managed job running this task."""
-        return self._job_priority
-
-    def set_job_priority(self, priority: int) -> 'Task':
-        """Sets the job priority for this task.
-
-        Args:
-            priority: an integer between 0 and 1000.
-
-        Returns:
-            self: The current task, with job priority set.
-        """
-        self._job_priority = priority
-        return self
-
     def set_time_estimator(self, func: Callable[['sky.Resources'],
                                                 int]) -> 'Task':
         """Sets a func mapping resources to estimated time (secs).
```
```diff
@@ -1436,9 +1531,6 @@ class Task:
         if self.service is not None:
             add_if_not_none('service', self.service.to_yaml_config())
 
-        if self.job_priority is not None:
-            add_if_not_none('job', {'priority': self.job_priority})
-
         add_if_not_none('num_nodes', self.num_nodes)
 
         if self.inputs is not None:
@@ -1478,6 +1570,12 @@ class Task:
         })
 
         add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
+        add_if_not_none('volumes', self.volumes)
+        if self.volume_mounts is not None:
+            config['volume_mounts'] = [
+                volume_mount.to_yaml_config()
+                for volume_mount in self.volume_mounts
+            ]
         return config
 
     def get_required_cloud_features(
```
|
@@ -31,7 +31,9 @@ setup: |
|
|
31
31
|
{% endif %}
|
32
32
|
|
33
33
|
run: |
|
34
|
+
{%- if consolidation_mode_job_id is none %}
|
34
35
|
{{ sky_activate_python_env }}
|
36
|
+
{%- endif %}
|
35
37
|
|
36
38
|
# Write env vars to a file
|
37
39
|
{%- for env_name, env_value in controller_envs.items() %}
|
@@ -42,9 +44,18 @@ run: |
|
|
42
44
|
# Note: The job is already in the `spot` table, marked as PENDING.
|
43
45
|
# CloudVmRayBackend._exec_code_on_head() calls
|
44
46
|
# managed_job_codegen.set_pending() before we get here.
|
45
|
-
|
47
|
+
{%- if consolidation_mode_job_id is not none %}
|
48
|
+
{{sky_python_cmd}} \
|
49
|
+
{%- else %}
|
50
|
+
python \
|
51
|
+
{%- endif %}
|
52
|
+
-u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
|
46
53
|
--user-yaml-path {{remote_original_user_yaml_path}} \
|
54
|
+
{%- if consolidation_mode_job_id is not none %}
|
55
|
+
--job-id {{consolidation_mode_job_id}} \
|
56
|
+
{%- else %}
|
47
57
|
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
58
|
+
{%- endif %}
|
48
59
|
--env-file {{remote_env_file_path}} \
|
49
60
|
--priority {{priority}}
|
50
61
|
|
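In consolidation mode the scheduler is launched with the server's Python interpreter (`{{sky_python_cmd}}`) and an explicitly passed job id, instead of activating the controller's environment and relying on `$SKYPILOT_INTERNAL_JOB_ID`. A minimal jinja2 sketch of the interpreter toggle (illustrative template and values, not the real one):

```python
import jinja2

# Renders the interpreter line differently depending on whether a
# consolidation-mode job id is present.
template = jinja2.Template(
    '{%- if consolidation_mode_job_id is not none %}'
    '{{ sky_python_cmd }}'
    '{%- else %}'
    'python'
    '{%- endif %} -u -m sky.jobs.scheduler')

print(template.render(consolidation_mode_job_id=42,
                      sky_python_cmd='/opt/sky/python'))
# /opt/sky/python -u -m sky.jobs.scheduler
print(template.render(consolidation_mode_job_id=None))
# python -u -m sky.jobs.scheduler
```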
sky/templates/kubernetes-ray.yml.j2
CHANGED
```diff
@@ -243,6 +243,22 @@ provider:
       # This selector must match the head node pod's selector below.
       selector:
         component: {{cluster_name_on_cloud}}-head
+    # Headless service mapping hostnames to rest of the worker nodes
+    {% for worker_id in range(1, num_nodes) %}
+    - apiVersion: v1
+      kind: Service
+      metadata:
+        labels:
+          parent: skypilot
+          skypilot-cluster: {{cluster_name_on_cloud}}
+          skypilot-user: {{ user }}
+        name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+      spec:
+        selector:
+          component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+        clusterIP: None
+    {% endfor %}
+
 
 # Specify the pod type for the ray head node (as configured below).
 head_node_type: ray_head_default
@@ -255,7 +271,7 @@ available_node_types:
       metadata:
         # name will be filled in the provisioner
         # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
-        # service is required.
+        # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
         labels:
           parent: skypilot
           # component will be set for the head node pod to be the same as the head node service selector above if a
@@ -287,6 +303,10 @@ available_node_types:
         serviceAccountName: {{k8s_service_account_name}}
         automountServiceAccountToken: {{k8s_automount_sa_token}}
         restartPolicy: {{ "Always" if high_availability else "Never" }}
+        {% if volume_mounts %}
+        securityContext:
+          fsGroup: 1000
+        {% endif %}
 
         # Add node selector if GPU/TPUs are requested:
         {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -365,6 +385,11 @@ available_node_types:
             persistentVolumeClaim:
               claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
           {% endif %}
+          {% for volume_mount in volume_mounts %}
+          - name: {{volume_mount.name}}
+            persistentVolumeClaim:
+              claimName: {{volume_mount.volume_name_on_cloud}}
+          {% endfor %}
         containers:
         - name: ray-node
           imagePullPolicy: IfNotPresent
@@ -641,7 +666,7 @@ available_node_types:
             {% if high_availability %}
               mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
               if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
-                SKYPILOT_HA_RECOVERY_LOG="
+                SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
                 echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
                 start_time=$SECONDS
                 retry_count=0
@@ -734,6 +759,10 @@ available_node_types:
           - name: fusermount-shared-dir
             mountPath: {{k8s_fusermount_shared_dir}}
           {% endif %}
+          {% for volume_mount in volume_mounts %}
+          - name: {{volume_mount.name}}
+            mountPath: {{volume_mount.path}}
+          {% endfor %}
           resources:
             requests:
               cpu: {{cpus}}
```
sky/users/permission.py
CHANGED
```diff
@@ -18,6 +18,8 @@ from sky.utils import common_utils
 
 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
+logging.getLogger('casbin.model').setLevel(sky_logging.ERROR)
+logging.getLogger('casbin.rbac').setLevel(sky_logging.ERROR)
 logger = sky_logging.init_logger(__name__)
 
 # Filelocks for the policy update.
```
sky/utils/admin_policy_utils.py
CHANGED
```diff
@@ -140,13 +140,17 @@ def apply(
         at_client_side)
     try:
         mutated_user_request = policy.apply(user_request)
+    # Avoid duplicate exception wrapping.
+    except exceptions.UserRequestRejectedByPolicy as e:
+        with ux_utils.print_exception_no_traceback():
+            raise e
     except Exception as e:  # pylint: disable=broad-except
         with ux_utils.print_exception_no_traceback():
             raise exceptions.UserRequestRejectedByPolicy(
                 f'{colorama.Fore.RED}User request rejected by policy '
                 f'{policy!r}{colorama.Fore.RESET}: '
                 f'{common_utils.format_exception(e, use_bracket=True)}'
-            ) from e
+            ) from None
     if mutated_config is None:
         mutated_config = mutated_user_request.skypilot_config
     else:
```
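The new early `except` re-raises policy rejections untouched, while the broad handler wraps other errors and now chains with `from None` to avoid a duplicated traceback. A standalone illustration of what `from None` changes (names are illustrative, not SkyPilot's):

```python
class PolicyRejected(Exception):
    pass

def apply_policy():
    raise ValueError('GPU quota exceeded')

try:
    try:
        apply_policy()
    except Exception as e:
        # `from None` suppresses the "During handling of the above
        # exception..." chained traceback that `from e` would keep.
        raise PolicyRejected(f'User request rejected by policy: {e}') from None
except PolicyRejected as err:
    print(err)            # the wrapped message, without the chained cause
    print(err.__cause__)  # None
```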
sky/utils/cli_utils/status_utils.py
CHANGED
```diff
@@ -7,7 +7,6 @@ import colorama
 
 from sky import backends
 from sky.utils import common_utils
-from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import resources_utils
 from sky.utils import status_lib
@@ -137,7 +136,8 @@ def get_total_cost_of_displayed_records(
 
 def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
                            show_all: bool,
-                           controller_name: Optional[str] = None):
+                           controller_name: Optional[str] = None,
+                           days: Optional[int] = None):
     """Compute cluster table values and display for cost report.
 
     For each cluster, this shows: cluster name, resources, launched time,
@@ -200,23 +200,21 @@ def show_cost_report_table(cluster_records: List[_ClusterCostReportRecord],
         cluster_table.add_row(row)
 
     if cluster_records:
+        controller_record = cluster_records[0]
         if controller_name is not None:
-            autostop_str = ''
-
-
-            controller_handle: backends.CloudVmRayResourceHandle = (
-                cluster_records[0]['handle'])
-            autostop_config = (
-                controller_handle.launched_resources.autostop_config)
-            if autostop_config is not None:
+            autostop = controller_record.get('autostop', None)
+            autostop_str = ''
+            if autostop is not None:
                 autostop_str = (f'{colorama.Style.DIM} (will be autostopped if '
-                                f'idle for {
+                                f'idle for {autostop}min)'
                                 f'{colorama.Style.RESET_ALL}')
             click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                        f'{controller_name}{colorama.Style.RESET_ALL}'
                        f'{autostop_str}')
         else:
-            click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
+            days_str = '' if days is None else f' (last {days} days)'
+            click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                       f'Clusters{days_str}'
                        f'{colorama.Style.RESET_ALL}')
         click.echo(cluster_table)
 
@@ -345,7 +343,9 @@ def _get_infra(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
 
 
 def _get_status_value_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> int:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> int:
+    del truncate
     status = cluster_cost_report_record['status']
     if status is None:
         return -1
@@ -353,7 +353,9 @@
 
 
 def _get_status_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     status = cluster_cost_report_record['status']
     if status is None:
         return f'{colorama.Style.DIM}TERMINATED{colorama.Style.RESET_ALL}'
@@ -361,7 +363,9 @@
 
 
 def _get_resources_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     launched_nodes = cluster_cost_report_record['num_nodes']
     launched_resources = cluster_cost_report_record['resources']
 
@@ -373,7 +377,9 @@
 
 
 def _get_price_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     launched_nodes = cluster_cost_report_record['num_nodes']
     launched_resources = cluster_cost_report_record['resources']
 
@@ -383,7 +389,9 @@
 
 
 def _get_estimated_cost_for_cost_report(
-        cluster_cost_report_record: _ClusterCostReportRecord) -> str:
+        cluster_cost_report_record: _ClusterCostReportRecord,
+        truncate: bool = True) -> str:
+    del truncate
     cost = cluster_cost_report_record['total_cost']
 
     if not cost:
```
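Each `_get_*_for_cost_report` helper now takes, and immediately discards, a `truncate` flag, presumably so all column getters share one signature and can be invoked uniformly by the table builder. An illustrative standalone sketch (names and records are hypothetical):

```python
from typing import Any, Callable, Dict, List

Record = Dict[str, Any]

def get_status(record: Record, truncate: bool = True) -> str:
    del truncate  # unused, kept for a uniform signature
    return record['status'] or 'TERMINATED'

def get_cost(record: Record, truncate: bool = True) -> str:
    del truncate  # unused, kept for a uniform signature
    return f'$ {record["total_cost"]:.2f}'

# Because every getter has the same shape, a generic renderer can call
# them all the same way.
COLUMNS: List[Callable[[Record, bool], str]] = [get_status, get_cost]

record = {'status': 'UP', 'total_cost': 1.5}
print([getter(record, True) for getter in COLUMNS])  # ['UP', '$ 1.50']
```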