skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +1 -6
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +232 -9
- sky/client/sdk.py +195 -91
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/ssh.py +36 -0
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +21 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
- sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +70 -4
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +153 -0
- sky/server/server.py +70 -43
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -3
- sky/skypilot_config.py +3 -0
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +133 -0
- sky/ssh_node_pools/server.py +232 -0
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/task.py
CHANGED
@@ -24,6 +24,7 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import schemas
 from sky.utils import ux_utils
+from sky.volumes import volume as volume_lib
 
 if typing.TYPE_CHECKING:
     import yaml
@@ -246,12 +247,14 @@ class Task:
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[str] = None,
         num_nodes: Optional[int] = None,
+        volumes: Optional[Dict[str, str]] = None,
         # Advanced:
         docker_image: Optional[str] = None,
         event_callback: Optional[str] = None,
         blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
         # Internal use only.
         file_mounts_mapping: Optional[Dict[str, str]] = None,
+        volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
     ):
         """Initializes a Task.
 
@@ -319,6 +322,7 @@ class Task:
         self.setup = setup
         self._envs = envs or {}
         self._secrets = secrets or {}
+        self._volumes = volumes or {}
 
         # Validate Docker login configuration early if both envs and secrets
         # contain Docker variables
@@ -361,7 +365,9 @@ class Task:
         self.best_resources: Optional[sky.Resources] = None
 
         # For internal use only.
-        self.file_mounts_mapping = file_mounts_mapping
+        self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
+        self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
+            volume_mounts)
 
         dag = sky.dag.get_current_dag()
         if dag is not None:
@@ -442,12 +448,9 @@ class Task:
         if self.file_mounts is None:
             return
         for target, source in self.file_mounts.items():
-            if target.endswith('/') or source.endswith('/'):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        'File mount paths cannot end with a slash '
-                        '(try "/mydir: /mydir" or "/myfile: /myfile"). '
-                        f'Found: target={target} source={source}')
+            location = f'file_mounts.{target}: {source}'
+            self._validate_mount_path(target, location)
+            self._validate_path(source, location)
             if data_utils.is_cloud_store_url(target):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
@@ -462,17 +465,25 @@
                         f'File mount source {source!r} does not exist '
                         'locally. To fix: check if it exists, and correct '
                         'the path.')
-            # TODO(zhwu): /home/username/sky_workdir as the target path need
-            # to be filtered out as well.
-            if (target == constants.SKY_REMOTE_WORKDIR and
-                    self.workdir is not None):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
-                        'destination path of a file mount, as it will be used '
-                        'by the workdir. If uploading a file/folder to the '
-                        'workdir is needed, please specify the full path to '
-                        'the file/folder.')
+
+    def _validate_mount_path(self, path: str, location: str):
+        self._validate_path(path, location)
+        # TODO(zhwu): /home/username/sky_workdir as the target path need
+        # to be filtered out as well.
+        if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
+                    'destination path of a file mount, as it will be used '
+                    'by the workdir. If uploading a file/folder to the '
+                    'workdir is needed, please specify the full path to '
+                    'the file/folder.')
+
+    def _validate_path(self, path: str, location: str):
+        if path.endswith('/'):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Mount paths cannot end with a slash '
+                                 f'Found: {path} in {location}')
 
     def expand_and_validate_workdir(self):
         """Expand workdir to absolute path and validate it.
@@ -587,6 +598,7 @@ class Task:
            secrets=config.pop('secrets', None),
            event_callback=config.pop('event_callback', None),
            file_mounts_mapping=config.pop('file_mounts_mapping', None),
+           volumes=config.pop('volumes', None),
        )
 
        # Create lists to store storage objects inlined in file_mounts.
@@ -711,6 +723,16 @@ class Task:
            service = service_spec.SkyServiceSpec.from_yaml_config(service)
            task.set_service(service)
 
+        volume_mounts = config.pop('volume_mounts', None)
+        if volume_mounts is not None:
+            task.volume_mounts = []
+            for vol in volume_mounts:
+                common_utils.validate_schema(vol,
+                                             schemas.get_volume_mount_schema(),
+                                             'Invalid volume mount config: ')
+                volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
+                task.volume_mounts.append(volume_mount)
+
        assert not config, f'Invalid task args: {config.keys()}'
        return task
 
@@ -745,6 +767,97 @@ class Task:
             config = {}
         return Task.from_yaml_config(config)
 
+    def resolve_and_validate_volumes(self) -> None:
+        """Resolve volumes config to volume mounts and validate them.
+
+        Raises:
+            exceptions.VolumeNotFoundError: if any volume is not found.
+            exceptions.VolumeTopologyConflictError: if there is conflict in the
+                volumes and compute topology.
+        """
+        # Volumes has been resolved, a typical case is that the API server
+        # has resolved the volumes and the dag was then submitted to
+        # controllers.
+        if self.volume_mounts is not None:
+            return None
+        if not self._volumes:
+            return None
+        volume_mounts: List[volume_lib.VolumeMount] = []
+        for dst_path, vol in self._volumes.items():
+            self._validate_mount_path(dst_path, location='volumes')
+            # Shortcut for `dst_path: volume_name`
+            if isinstance(vol, str):
+                volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
+            elif isinstance(vol, dict):
+                assert 'name' in vol, 'Volume name must be set.'
+                volume_mount = volume_lib.VolumeMount.resolve(
+                    dst_path, vol['name'])
+            else:
+                raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
+            volume_mounts.append(volume_mount)
+        # Disable certain access modes
+        disabled_modes = {}
+        if self.num_nodes > 1:
+            disabled_modes[
+                volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
+                    'access mode ReadWriteOnce is not supported for '
+                    'multi-node tasks.')
+            disabled_modes[
+                volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
+                    'access mode ReadWriteOncePod is not supported for '
+                    'multi-node tasks.')
+        # TODO(aylei): generalize access mode to all volume types
+        # Record the required topology and the volume that requires it, e.g.
+        # {'cloud': ('volume_name', 'aws')}
+        topology: Dict[str, Tuple[str, Optional[str]]] = {
+            'cloud': ('', None),
+            'region': ('', None),
+            'zone': ('', None),
+        }
+        for vol in volume_mounts:
+            # Check access mode
+            access_mode = vol.volume_config.config.get('access_mode', '')
+            if access_mode in disabled_modes:
+                raise ValueError(f'Volume {vol.volume_name} with '
+                                 f'{disabled_modes[access_mode]}')
+            # Check topology
+            for key, (vol_name, previous_req) in topology.items():
+                req = getattr(vol.volume_config, key)
+                if req is not None:
+                    if previous_req is not None and req != previous_req:
+                        raise exceptions.VolumeTopologyConflictError(
+                            f'Volume {vol.volume_name} can only be attached on '
+                            f'{key}:{req}, which conflicts with another volume '
+                            f'{vol_name} that requires {key}:{previous_req}.'
+                            f'Please use different volumes and retry.')
+                    topology[key] = (vol_name, req)
+        # Now we have the topology requirements from the intersection of all
+        # volumes. Check if there is topology conflict with the resources.
+        # Volume must have no conflict with ALL resources even if user
+        # specifies 'any_of' resources to ensure no resources will conflict
+        # with the volumes during failover.
+
+        for res in self.resources:
+            for key, (vol_name, vol_req) in topology.items():
+                req = getattr(res, key)
+                if (req is not None and vol_req is not None and
+                        str(req) != vol_req):
+                    raise exceptions.VolumeTopologyConflictError(
+                        f'The task requires {key}:{req}, which conflicts with '
+                        f'the volume constraint {key}:{vol_req}. Please '
+                        f'use different volumes and retry.')
+        # No topology conflict, we safely override the topology of resources to
+        # satisfy the volume constraints.
+        override_params = {}
+        for key, (vol_name, vol_req) in topology.items():
+            if vol_req is not None:
+                if key == 'cloud':
+                    override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
+                else:
+                    override_params[key] = vol_req
+        self.set_resources_override(override_params)
+        self.volume_mounts = volume_mounts
+
     @property
     def num_nodes(self) -> int:
         return self._num_nodes
@@ -767,6 +880,10 @@
     def secrets(self) -> Dict[str, str]:
         return self._secrets
 
+    @property
+    def volumes(self) -> Dict[str, str]:
+        return self._volumes
+
     def update_envs(
             self, envs: Union[None, List[Tuple[str, str]],
                               Dict[str, str]]) -> 'Task':
@@ -1453,6 +1570,12 @@
         })
 
         add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
+        add_if_not_none('volumes', self.volumes)
+        if self.volume_mounts is not None:
+            config['volume_mounts'] = [
+                volume_mount.to_yaml_config()
+                for volume_mount in self.volume_mounts
+            ]
         return config
 
     def get_required_cloud_features(
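For illustration only (not part of the diff): a minimal sketch of how the `volumes` parameter added to `Task.__init__` above could be used, following the `dst_path: volume_name` shortcut that `resolve_and_validate_volumes()` handles. The mount path and volume name are placeholders, and the volume is assumed to exist already.

    import sky

    # '/mnt/data' and 'my-data' are placeholder values; the volume named
    # 'my-data' is assumed to have been created beforehand.
    task = sky.Task(run='ls /mnt/data', volumes={'/mnt/data': 'my-data'})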
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -243,6 +243,22 @@ provider:
       # This selector must match the head node pod's selector below.
       selector:
         component: {{cluster_name_on_cloud}}-head
+  # Headless service mapping hostnames to rest of the worker nodes
+  {% for worker_id in range(1, num_nodes) %}
+  - apiVersion: v1
+    kind: Service
+    metadata:
+      labels:
+        parent: skypilot
+        skypilot-cluster: {{cluster_name_on_cloud}}
+        skypilot-user: {{ user }}
+      name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+    spec:
+      selector:
+        component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+      clusterIP: None
+  {% endfor %}
+
 
 # Specify the pod type for the ray head node (as configured below).
 head_node_type: ray_head_default
@@ -255,7 +271,7 @@ available_node_types:
       metadata:
         # name will be filled in the provisioner
         # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
-        # service is required.
+        # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
         labels:
           parent: skypilot
           # component will be set for the head node pod to be the same as the head node service selector above if a
@@ -287,6 +303,10 @@ available_node_types:
       serviceAccountName: {{k8s_service_account_name}}
       automountServiceAccountToken: {{k8s_automount_sa_token}}
       restartPolicy: {{ "Always" if high_availability else "Never" }}
+      {% if volume_mounts %}
+      securityContext:
+        fsGroup: 1000
+      {% endif %}
 
       # Add node selector if GPU/TPUs are requested:
       {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -365,6 +385,11 @@ available_node_types:
         persistentVolumeClaim:
           claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
       {% endif %}
+      {% for volume_mount in volume_mounts %}
+      - name: {{volume_mount.name}}
+        persistentVolumeClaim:
+          claimName: {{volume_mount.volume_name_on_cloud}}
+      {% endfor %}
       containers:
         - name: ray-node
           imagePullPolicy: IfNotPresent
@@ -734,6 +759,10 @@ available_node_types:
           - name: fusermount-shared-dir
             mountPath: {{k8s_fusermount_shared_dir}}
           {% endif %}
+          {% for volume_mount in volume_mounts %}
+          - name: {{volume_mount.name}}
+            mountPath: {{volume_mount.path}}
+          {% endfor %}
           resources:
             requests:
               cpu: {{cpus}}
sky/users/permission.py
CHANGED
@@ -18,6 +18,8 @@ from sky.utils import common_utils
 
 logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
 logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
+logging.getLogger('casbin.model').setLevel(sky_logging.ERROR)
+logging.getLogger('casbin.rbac').setLevel(sky_logging.ERROR)
 logger = sky_logging.init_logger(__name__)
 
 # Filelocks for the policy update.
sky/utils/context.py
CHANGED
@@ -254,7 +254,9 @@ class Popen(subprocess.Popen):
     def __init__(self, *args, **kwargs):
         env = kwargs.pop('env', None)
         if env is None:
-            env = os.environ
+            # Pass a copy of current context.environ to avoid race condition
+            # when the context is updated after the Popen is created.
+            env = os.environ.copy()
         super().__init__(*args, env=env, **kwargs)
 
 
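For illustration only (not part of the diff): a small sketch of why the copy matters. The child process receives a snapshot of the environment taken when `Popen` is constructed, so later mutations of the (context-aware) `os.environ` cannot race with the launch. Variable names are placeholders.

    import os
    import subprocess

    env = os.environ.copy()              # snapshot taken at construction time
    os.environ['SOME_FLAG'] = 'changed'  # later mutation does not reach the child
    subprocess.run(['env'], env=env, check=False)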
sky/utils/kubernetes/deploy_remote_cluster.py
CHANGED
@@ -11,11 +11,12 @@ import shutil
 import subprocess
 import sys
 import tempfile
-from typing import
+from typing import List, Set
 
 import yaml
 
 from sky.utils import ux_utils
+from sky.utils.kubernetes import ssh_utils
 
 # Colors for nicer UX
 RED = '\033[0;31m'
@@ -24,7 +25,6 @@ YELLOW = '\033[1;33m'
 WARNING_YELLOW = '\x1b[33m'
 NC = '\033[0m'  # No color
 
-DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
 DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
 SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
 NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
@@ -33,29 +33,6 @@ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
-class UniqueKeySafeLoader(yaml.SafeLoader):
-    """Custom YAML loader that raises an error if there are duplicate keys."""
-
-    def construct_mapping(self, node, deep=False):
-        mapping = {}
-        for key_node, value_node in node.value:
-            key = self.construct_object(key_node, deep=deep)
-            if key in mapping:
-                raise yaml.constructor.ConstructorError(
-                    note=(f'Duplicate cluster config for cluster {key!r}.\n'
-                          'Please remove one of them from: '
-                          f'{DEFAULT_SSH_NODE_POOLS_PATH}'))
-            value = self.construct_object(value_node, deep=deep)
-            mapping[key] = value
-        return mapping
-
-
-# Register the custom constructor inside the class
-UniqueKeySafeLoader.add_constructor(
-    yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
-    UniqueKeySafeLoader.construct_mapping)
-
-
 def parse_args():
     parser = argparse.ArgumentParser(
         description='Deploy a Kubernetes cluster on remote machines.')
@@ -64,9 +41,9 @@ def parse_args():
     parser.add_argument(
         '--ssh-node-pools-file',
         dest='ssh_node_pools_file',
-        default=DEFAULT_SSH_NODE_POOLS_PATH,
+        default=ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
         help=
-        f'Path to SSH node pools YAML file (default: {DEFAULT_SSH_NODE_POOLS_PATH})'
+        f'Path to SSH node pools YAML file (default: {ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH})'
     )
     parser.add_argument(
         '--kubeconfig-path',
@@ -117,156 +94,6 @@ def parse_args():
     return parser.parse_args()
 
 
-def load_ssh_targets(file_path: str) -> Dict[str, Any]:
-    """Load SSH targets from YAML file."""
-    if not os.path.exists(file_path):
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'SSH Node Pools file not found: {file_path}')
-
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            targets = yaml.load(f, Loader=UniqueKeySafeLoader)
-        return targets
-    except yaml.constructor.ConstructorError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(e.note) from e
-    except (yaml.YAMLError, IOError, OSError) as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
-
-
-def check_host_in_ssh_config(hostname: str) -> bool:
-    """Return True iff *hostname* matches at least one `Host`/`Match` stanza
-    in the user's OpenSSH client configuration (including anything pulled in
-    via Include).
-
-    It calls: ssh -vvG <hostname> -o ConnectTimeout=0
-    which:
-      • -G expands the effective config without connecting
-      • -vv prints debug lines that show which stanzas are applied
-      • ConnectTimeout=0 avoids a DNS lookup if <hostname> is a FQDN/IP
-
-    No config files are opened or parsed manually.
-
-    Parameters
-    ----------
-    hostname : str
-        The alias/IP/FQDN you want to test.
-
-    Returns
-    -------
-    bool
-        True – a specific stanza matched the host
-        False – nothing but the global defaults (`Host *`) applied
-    """
-    # We direct stderr→stdout because debug output goes to stderr.
-    proc = subprocess.run(
-        ['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
-        text=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        check=False,  # we only want the text, not to raise
-    )
-
-    # Look for lines like:
-    #   debug1: ~/.ssh/config line 42: Applying options for <hostname>
-    # Anything other than "*"
-    pattern = re.compile(r'^debug\d+: .*Applying options for ([^*].*)$',
-                         re.MULTILINE)
-
-    return bool(pattern.search(proc.stdout))
-
-
-def get_cluster_config(targets: Dict[str, Any],
-                       cluster_name: Optional[str] = None,
-                       file_path: Optional[str] = None) -> Dict[str, Any]:
-    """Get configuration for specific clusters or all clusters."""
-    if not targets:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(
-                f'No clusters defined in SSH Node Pools file {file_path}')
-
-    if cluster_name:
-        if cluster_name not in targets:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(f'Cluster {cluster_name!r} not found in '
-                                 f'SSH Node Pools file {file_path}')
-        return {cluster_name: targets[cluster_name]}
-
-    # Return all clusters if no specific cluster is specified
-    return targets
-
-
-def prepare_hosts_info(cluster_name: str,
-                       cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
-    """Prepare list of hosts with resolved user, identity_file, and password."""
-    if 'hosts' not in cluster_config or not cluster_config['hosts']:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(
-                f'No hosts defined in cluster {cluster_name} configuration')
-
-    # Get cluster-level defaults
-    cluster_user = cluster_config.get('user', '')
-    cluster_identity_file = os.path.expanduser(
-        cluster_config.get('identity_file', ''))
-    cluster_password = cluster_config.get('password', '')
-
-    # Check if cluster identity file exists
-    if cluster_identity_file and not os.path.isfile(cluster_identity_file):
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(
-                f'SSH Identity File Missing: {cluster_identity_file}')
-
-    hosts_info = []
-    for host in cluster_config['hosts']:
-        # Host can be a string (IP or SSH config hostname) or a dict
-        if isinstance(host, str):
-            # Check if this is an SSH config hostname
-            is_ssh_config_host = check_host_in_ssh_config(host)
-
-            hosts_info.append({
-                'ip': host,
-                'user': '' if is_ssh_config_host else cluster_user,
-                'identity_file': '' if is_ssh_config_host else
-                                 cluster_identity_file,
-                'password': cluster_password,
-                'use_ssh_config': is_ssh_config_host
-            })
-        else:
-            # It's a dict with potential overrides
-            if 'ip' not in host:
-                print(
-                    f'{RED}Warning: Host missing \'ip\' field, skipping: {host}{NC}'
-                )
-                continue
-
-            # Check if this is an SSH config hostname
-            is_ssh_config_host = check_host_in_ssh_config(host['ip'])
-
-            # Use host-specific values or fall back to cluster defaults
-            host_user = '' if is_ssh_config_host else host.get(
-                'user', cluster_user)
-            host_identity_file = os.path.expanduser(
-                '' if is_ssh_config_host else host.
-                get('identity_file', cluster_identity_file))
-            host_password = host.get('password', cluster_password)
-
-            if host_identity_file and not os.path.isfile(host_identity_file):
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'SSH Identity File Missing: {host_identity_file}')
-
-            hosts_info.append({
-                'ip': host['ip'],
-                'user': host_user,
-                'identity_file': host_identity_file,
-                'password': host_password,
-                'use_ssh_config': is_ssh_config_host
-            })
-
-    return hosts_info
-
-
 def run_command(cmd, shell=False):
     """Run a local command and return the output."""
     process = subprocess.run(cmd,
@@ -675,10 +502,10 @@ def main():
         password = args.password
 
         # Check if hosts are in SSH config
-        head_use_ssh_config = global_use_ssh_config or check_host_in_ssh_config(
+        head_use_ssh_config = global_use_ssh_config or ssh_utils.check_host_in_ssh_config(
             head_node)
         worker_use_ssh_config = [
-            global_use_ssh_config or check_host_in_ssh_config(node)
+            global_use_ssh_config or ssh_utils.check_host_in_ssh_config(node)
             for node in worker_nodes
         ]
 
@@ -688,10 +515,9 @@ def main():
                                  kubeconfig_path, args.cleanup)
     else:
         # Using YAML configuration
-        targets = load_ssh_targets(args.ssh_node_pools_file)
-        clusters_config = get_cluster_config(
-            targets, args.infra,
-            file_path=args.ssh_node_pools_file)
+        targets = ssh_utils.load_ssh_targets(args.ssh_node_pools_file)
+        clusters_config = ssh_utils.get_cluster_config(
+            targets, args.infra, file_path=args.ssh_node_pools_file)
 
         # Print information about clusters being processed
         num_clusters = len(clusters_config)
@@ -705,7 +531,8 @@ def main():
             print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
             print(
                 f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
-            hosts_info = prepare_hosts_info(cluster_name, cluster_config)
+            hosts_info = ssh_utils.prepare_hosts_info(
+                cluster_name, cluster_config)
 
             if not hosts_info:
                 print(
@@ -744,7 +571,7 @@ def main():
                         f'Cluster configuration has changed for field {key!r}. '
                         f'Previous value: {history.get(key)}, '
                         f'Current value: {cluster_config.get(key)}')
-            history_hosts_info = prepare_hosts_info(
+            history_hosts_info = ssh_utils.prepare_hosts_info(
                 cluster_name, history)
             if not args.cleanup and history_hosts_info[0] != hosts_info[
                 0]: