skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registries.
Files changed (163)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/task.py CHANGED
@@ -24,6 +24,7 @@ from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import schemas
  from sky.utils import ux_utils
+ from sky.volumes import volume as volume_lib

  if typing.TYPE_CHECKING:
  import yaml
@@ -246,12 +247,14 @@ class Task:
  secrets: Optional[Dict[str, str]] = None,
  workdir: Optional[str] = None,
  num_nodes: Optional[int] = None,
+ volumes: Optional[Dict[str, str]] = None,
  # Advanced:
  docker_image: Optional[str] = None,
  event_callback: Optional[str] = None,
  blocked_resources: Optional[Iterable['resources_lib.Resources']] = None,
  # Internal use only.
  file_mounts_mapping: Optional[Dict[str, str]] = None,
+ volume_mounts: Optional[List[volume_lib.VolumeMount]] = None,
  ):
  """Initializes a Task.

@@ -319,6 +322,7 @@ class Task:
  self.setup = setup
  self._envs = envs or {}
  self._secrets = secrets or {}
+ self._volumes = volumes or {}

  # Validate Docker login configuration early if both envs and secrets
  # contain Docker variables
@@ -361,7 +365,9 @@ class Task:
  self.best_resources: Optional[sky.Resources] = None

  # For internal use only.
- self.file_mounts_mapping = file_mounts_mapping
+ self.file_mounts_mapping: Optional[Dict[str, str]] = file_mounts_mapping
+ self.volume_mounts: Optional[List[volume_lib.VolumeMount]] = (
+ volume_mounts)

  dag = sky.dag.get_current_dag()
  if dag is not None:
@@ -442,12 +448,9 @@ class Task:
  if self.file_mounts is None:
  return
  for target, source in self.file_mounts.items():
- if target.endswith('/') or source.endswith('/'):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- 'File mount paths cannot end with a slash '
- '(try "/mydir: /mydir" or "/myfile: /myfile"). '
- f'Found: target={target} source={source}')
+ location = f'file_mounts.{target}: {source}'
+ self._validate_mount_path(target, location)
+ self._validate_path(source, location)
  if data_utils.is_cloud_store_url(target):
  with ux_utils.print_exception_no_traceback():
  raise ValueError(
@@ -462,17 +465,25 @@ class Task:
  f'File mount source {source!r} does not exist '
  'locally. To fix: check if it exists, and correct '
  'the path.')
- # TODO(zhwu): /home/username/sky_workdir as the target path need
- # to be filtered out as well.
- if (target == constants.SKY_REMOTE_WORKDIR and
- self.workdir is not None):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
- 'destination path of a file mount, as it will be used '
- 'by the workdir. If uploading a file/folder to the '
- 'workdir is needed, please specify the full path to '
- 'the file/folder.')
+
+ def _validate_mount_path(self, path: str, location: str):
+ self._validate_path(path, location)
+ # TODO(zhwu): /home/username/sky_workdir as the target path need
+ # to be filtered out as well.
+ if (path == constants.SKY_REMOTE_WORKDIR and self.workdir is not None):
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Cannot use {constants.SKY_REMOTE_WORKDIR!r} as a '
+ 'destination path of a file mount, as it will be used '
+ 'by the workdir. If uploading a file/folder to the '
+ 'workdir is needed, please specify the full path to '
+ 'the file/folder.')
+
+ def _validate_path(self, path: str, location: str):
+ if path.endswith('/'):
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError('Mount paths cannot end with a slash '
+ f'Found: {path} in {location}')

  def expand_and_validate_workdir(self):
  """Expand workdir to absolute path and validate it.
@@ -587,6 +598,7 @@ class Task:
  secrets=config.pop('secrets', None),
  event_callback=config.pop('event_callback', None),
  file_mounts_mapping=config.pop('file_mounts_mapping', None),
+ volumes=config.pop('volumes', None),
  )

  # Create lists to store storage objects inlined in file_mounts.
@@ -711,6 +723,16 @@ class Task:
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
  task.set_service(service)

+ volume_mounts = config.pop('volume_mounts', None)
+ if volume_mounts is not None:
+ task.volume_mounts = []
+ for vol in volume_mounts:
+ common_utils.validate_schema(vol,
+ schemas.get_volume_mount_schema(),
+ 'Invalid volume mount config: ')
+ volume_mount = volume_lib.VolumeMount.from_yaml_config(vol)
+ task.volume_mounts.append(volume_mount)
+
  assert not config, f'Invalid task args: {config.keys()}'
  return task

@@ -745,6 +767,97 @@ class Task:
  config = {}
  return Task.from_yaml_config(config)

+ def resolve_and_validate_volumes(self) -> None:
+ """Resolve volumes config to volume mounts and validate them.
+
+ Raises:
+ exceptions.VolumeNotFoundError: if any volume is not found.
+ exceptions.VolumeTopologyConflictError: if there is conflict in the
+ volumes and compute topology.
+ """
+ # Volumes has been resolved, a typical case is that the API server
+ # has resolved the volumes and the dag was then submitted to
+ # controllers.
+ if self.volume_mounts is not None:
+ return None
+ if not self._volumes:
+ return None
+ volume_mounts: List[volume_lib.VolumeMount] = []
+ for dst_path, vol in self._volumes.items():
+ self._validate_mount_path(dst_path, location='volumes')
+ # Shortcut for `dst_path: volume_name`
+ if isinstance(vol, str):
+ volume_mount = volume_lib.VolumeMount.resolve(dst_path, vol)
+ elif isinstance(vol, dict):
+ assert 'name' in vol, 'Volume name must be set.'
+ volume_mount = volume_lib.VolumeMount.resolve(
+ dst_path, vol['name'])
+ else:
+ raise ValueError(f'Invalid volume config: {dst_path}: {vol}')
+ volume_mounts.append(volume_mount)
+ # Disable certain access modes
+ disabled_modes = {}
+ if self.num_nodes > 1:
+ disabled_modes[
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value] = (
+ 'access mode ReadWriteOnce is not supported for '
+ 'multi-node tasks.')
+ disabled_modes[
+ volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value] = (
+ 'access mode ReadWriteOncePod is not supported for '
+ 'multi-node tasks.')
+ # TODO(aylei): generalize access mode to all volume types
+ # Record the required topology and the volume that requires it, e.g.
+ # {'cloud': ('volume_name', 'aws')}
+ topology: Dict[str, Tuple[str, Optional[str]]] = {
+ 'cloud': ('', None),
+ 'region': ('', None),
+ 'zone': ('', None),
+ }
+ for vol in volume_mounts:
+ # Check access mode
+ access_mode = vol.volume_config.config.get('access_mode', '')
+ if access_mode in disabled_modes:
+ raise ValueError(f'Volume {vol.volume_name} with '
+ f'{disabled_modes[access_mode]}')
+ # Check topology
+ for key, (vol_name, previous_req) in topology.items():
+ req = getattr(vol.volume_config, key)
+ if req is not None:
+ if previous_req is not None and req != previous_req:
+ raise exceptions.VolumeTopologyConflictError(
+ f'Volume {vol.volume_name} can only be attached on '
+ f'{key}:{req}, which conflicts with another volume '
+ f'{vol_name} that requires {key}:{previous_req}.'
+ f'Please use different volumes and retry.')
+ topology[key] = (vol_name, req)
+ # Now we have the topology requirements from the intersection of all
+ # volumes. Check if there is topology conflict with the resources.
+ # Volume must have no conflict with ALL resources even if user
+ # specifies 'any_of' resources to ensure no resources will conflict
+ # with the volumes during failover.
+
+ for res in self.resources:
+ for key, (vol_name, vol_req) in topology.items():
+ req = getattr(res, key)
+ if (req is not None and vol_req is not None and
+ str(req) != vol_req):
+ raise exceptions.VolumeTopologyConflictError(
+ f'The task requires {key}:{req}, which conflicts with '
+ f'the volume constraint {key}:{vol_req}. Please '
+ f'use different volumes and retry.')
+ # No topology conflict, we safely override the topology of resources to
+ # satisfy the volume constraints.
+ override_params = {}
+ for key, (vol_name, vol_req) in topology.items():
+ if vol_req is not None:
+ if key == 'cloud':
+ override_params[key] = sky.CLOUD_REGISTRY.from_str(vol_req)
+ else:
+ override_params[key] = vol_req
+ self.set_resources_override(override_params)
+ self.volume_mounts = volume_mounts
+
  @property
  def num_nodes(self) -> int:
  return self._num_nodes
@@ -767,6 +880,10 @@ class Task:
  def secrets(self) -> Dict[str, str]:
  return self._secrets

+ @property
+ def volumes(self) -> Dict[str, str]:
+ return self._volumes
+
  def update_envs(
  self, envs: Union[None, List[Tuple[str, str]],
  Dict[str, str]]) -> 'Task':
@@ -1453,6 +1570,12 @@ class Task:
  })

  add_if_not_none('file_mounts_mapping', self.file_mounts_mapping)
+ add_if_not_none('volumes', self.volumes)
+ if self.volume_mounts is not None:
+ config['volume_mounts'] = [
+ volume_mount.to_yaml_config()
+ for volume_mount in self.volume_mounts
+ ]
  return config

  def get_required_cloud_features(
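
The hunks above add a `volumes` field to `Task`: a mapping from a destination mount path to a volume name (or to a dict carrying a `name` key), which `resolve_and_validate_volumes()` later turns into `VolumeMount` objects and reconciles with the task's resources. A minimal usage sketch based only on the constructor signature shown above; the mount path and volume name are hypothetical, and the volume itself would have to be created separately (see the new sky/volumes modules in the file list):

import sky

# '/mnt/data' and 'my-volume' are illustrative names, not defaults.
task = sky.Task(run='ls /mnt/data', volumes={'/mnt/data': 'my-volume'})
print(task.volumes)  # {'/mnt/data': 'my-volume'}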
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -243,6 +243,22 @@ provider:
  # This selector must match the head node pod's selector below.
  selector:
  component: {{cluster_name_on_cloud}}-head
+ # Headless service mapping hostnames to rest of the worker nodes
+ {% for worker_id in range(1, num_nodes) %}
+ - apiVersion: v1
+ kind: Service
+ metadata:
+ labels:
+ parent: skypilot
+ skypilot-cluster: {{cluster_name_on_cloud}}
+ skypilot-user: {{ user }}
+ name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+ spec:
+ selector:
+ component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
+ clusterIP: None
+ {% endfor %}
+

  # Specify the pod type for the ray head node (as configured below).
  head_node_type: ray_head_default
@@ -255,7 +271,7 @@ available_node_types:
  metadata:
  # name will be filled in the provisioner
  # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
- # service is required.
+ # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
  labels:
  parent: skypilot
  # component will be set for the head node pod to be the same as the head node service selector above if a
@@ -287,6 +303,10 @@ available_node_types:
  serviceAccountName: {{k8s_service_account_name}}
  automountServiceAccountToken: {{k8s_automount_sa_token}}
  restartPolicy: {{ "Always" if high_availability else "Never" }}
+ {% if volume_mounts %}
+ securityContext:
+ fsGroup: 1000
+ {% endif %}

  # Add node selector if GPU/TPUs are requested:
  {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -365,6 +385,11 @@ available_node_types:
  persistentVolumeClaim:
  claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
  {% endif %}
+ {% for volume_mount in volume_mounts %}
+ - name: {{volume_mount.name}}
+ persistentVolumeClaim:
+ claimName: {{volume_mount.volume_name_on_cloud}}
+ {% endfor %}
  containers:
  - name: ray-node
  imagePullPolicy: IfNotPresent
@@ -734,6 +759,10 @@ available_node_types:
  - name: fusermount-shared-dir
  mountPath: {{k8s_fusermount_shared_dir}}
  {% endif %}
+ {% for volume_mount in volume_mounts %}
+ - name: {{volume_mount.name}}
+ mountPath: {{volume_mount.path}}
+ {% endfor %}
  resources:
  requests:
  cpu: {{cpus}}
sky/users/permission.py CHANGED
@@ -18,6 +18,8 @@ from sky.utils import common_utils

  logging.getLogger('casbin.policy').setLevel(sky_logging.ERROR)
  logging.getLogger('casbin.role').setLevel(sky_logging.ERROR)
+ logging.getLogger('casbin.model').setLevel(sky_logging.ERROR)
+ logging.getLogger('casbin.rbac').setLevel(sky_logging.ERROR)
  logger = sky_logging.init_logger(__name__)

  # Filelocks for the policy update.
sky/utils/context.py CHANGED
@@ -254,7 +254,9 @@ class Popen(subprocess.Popen):
  def __init__(self, *args, **kwargs):
  env = kwargs.pop('env', None)
  if env is None:
- env = os.environ
+ # Pass a copy of current context.environ to avoid race condition
+ # when the context is updated after the Popen is created.
+ env = os.environ.copy()
  super().__init__(*args, env=env, **kwargs)


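
This change makes the context-aware `Popen` wrapper snapshot the environment instead of handing the live `os.environ` mapping to the subprocess machinery, so later updates to the context's environment cannot race with a launch that is already in flight. A standalone sketch of the idea (illustration only, not SkyPilot code; `LATER_VAR` is a made-up variable):

import os
import subprocess

env_snapshot = os.environ.copy()            # freeze the environment now
os.environ['LATER_VAR'] = 'set-afterwards'  # a later mutation, as in the race
proc = subprocess.Popen(['env'], env=env_snapshot,
                        stdout=subprocess.PIPE, text=True)
out, _ = proc.communicate()
assert 'LATER_VAR=set-afterwards' not in out  # the child never sees the update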
sky/utils/kubernetes/deploy_remote_cluster.py CHANGED
@@ -11,11 +11,12 @@ import shutil
  import subprocess
  import sys
  import tempfile
- from typing import Any, Dict, List, Optional, Set
+ from typing import List, Set

  import yaml

  from sky.utils import ux_utils
+ from sky.utils.kubernetes import ssh_utils

  # Colors for nicer UX
  RED = '\033[0;31m'
@@ -24,7 +25,6 @@ YELLOW = '\033[1;33m'
  WARNING_YELLOW = '\x1b[33m'
  NC = '\033[0m' # No color

- DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
  SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
  NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
@@ -33,29 +33,6 @@ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))


- class UniqueKeySafeLoader(yaml.SafeLoader):
- """Custom YAML loader that raises an error if there are duplicate keys."""
-
- def construct_mapping(self, node, deep=False):
- mapping = {}
- for key_node, value_node in node.value:
- key = self.construct_object(key_node, deep=deep)
- if key in mapping:
- raise yaml.constructor.ConstructorError(
- note=(f'Duplicate cluster config for cluster {key!r}.\n'
- 'Please remove one of them from: '
- f'{DEFAULT_SSH_NODE_POOLS_PATH}'))
- value = self.construct_object(value_node, deep=deep)
- mapping[key] = value
- return mapping
-
-
- # Register the custom constructor inside the class
- UniqueKeySafeLoader.add_constructor(
- yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
- UniqueKeySafeLoader.construct_mapping)
-
-
  def parse_args():
  parser = argparse.ArgumentParser(
  description='Deploy a Kubernetes cluster on remote machines.')
@@ -64,9 +41,9 @@ def parse_args():
  parser.add_argument(
  '--ssh-node-pools-file',
  dest='ssh_node_pools_file',
- default=DEFAULT_SSH_NODE_POOLS_PATH,
+ default=ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
  help=
- f'Path to SSH node pools YAML file (default: {DEFAULT_SSH_NODE_POOLS_PATH})'
+ f'Path to SSH node pools YAML file (default: {ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH})'
  )
  parser.add_argument(
  '--kubeconfig-path',
@@ -117,156 +94,6 @@ def parse_args():
  return parser.parse_args()


- def load_ssh_targets(file_path: str) -> Dict[str, Any]:
- """Load SSH targets from YAML file."""
- if not os.path.exists(file_path):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(f'SSH Node Pools file not found: {file_path}')
-
- try:
- with open(file_path, 'r', encoding='utf-8') as f:
- targets = yaml.load(f, Loader=UniqueKeySafeLoader)
- return targets
- except yaml.constructor.ConstructorError as e:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(e.note) from e
- except (yaml.YAMLError, IOError, OSError) as e:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
-
-
- def check_host_in_ssh_config(hostname: str) -> bool:
- """Return True iff *hostname* matches at least one `Host`/`Match` stanza
- in the user's OpenSSH client configuration (including anything pulled in
- via Include).
-
- It calls: ssh -vvG <hostname> -o ConnectTimeout=0
- which:
- • -G expands the effective config without connecting
- • -vv prints debug lines that show which stanzas are applied
- • ConnectTimeout=0 avoids a DNS lookup if <hostname> is a FQDN/IP
-
- No config files are opened or parsed manually.
-
- Parameters
- ----------
- hostname : str
- The alias/IP/FQDN you want to test.
-
- Returns
- -------
- bool
- True – a specific stanza matched the host
- False – nothing but the global defaults (`Host *`) applied
- """
- # We direct stderr→stdout because debug output goes to stderr.
- proc = subprocess.run(
- ['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
- text=True,
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- check=False, # we only want the text, not to raise
- )
-
- # Look for lines like:
- # debug1: ~/.ssh/config line 42: Applying options for <hostname>
- # Anything other than "*"
- pattern = re.compile(r'^debug\d+: .*Applying options for ([^*].*)$',
- re.MULTILINE)
-
- return bool(pattern.search(proc.stdout))
-
-
- def get_cluster_config(targets: Dict[str, Any],
- cluster_name: Optional[str] = None,
- file_path: Optional[str] = None) -> Dict[str, Any]:
- """Get configuration for specific clusters or all clusters."""
- if not targets:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'No clusters defined in SSH Node Pools file {file_path}')
-
- if cluster_name:
- if cluster_name not in targets:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(f'Cluster {cluster_name!r} not found in '
- f'SSH Node Pools file {file_path}')
- return {cluster_name: targets[cluster_name]}
-
- # Return all clusters if no specific cluster is specified
- return targets
-
-
- def prepare_hosts_info(cluster_name: str,
- cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
- """Prepare list of hosts with resolved user, identity_file, and password."""
- if 'hosts' not in cluster_config or not cluster_config['hosts']:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'No hosts defined in cluster {cluster_name} configuration')
-
- # Get cluster-level defaults
- cluster_user = cluster_config.get('user', '')
- cluster_identity_file = os.path.expanduser(
- cluster_config.get('identity_file', ''))
- cluster_password = cluster_config.get('password', '')
-
- # Check if cluster identity file exists
- if cluster_identity_file and not os.path.isfile(cluster_identity_file):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'SSH Identity File Missing: {cluster_identity_file}')
-
- hosts_info = []
- for host in cluster_config['hosts']:
- # Host can be a string (IP or SSH config hostname) or a dict
- if isinstance(host, str):
- # Check if this is an SSH config hostname
- is_ssh_config_host = check_host_in_ssh_config(host)
-
- hosts_info.append({
- 'ip': host,
- 'user': '' if is_ssh_config_host else cluster_user,
- 'identity_file': '' if is_ssh_config_host else
- cluster_identity_file,
- 'password': cluster_password,
- 'use_ssh_config': is_ssh_config_host
- })
- else:
- # It's a dict with potential overrides
- if 'ip' not in host:
- print(
- f'{RED}Warning: Host missing \'ip\' field, skipping: {host}{NC}'
- )
- continue
-
- # Check if this is an SSH config hostname
- is_ssh_config_host = check_host_in_ssh_config(host['ip'])
-
- # Use host-specific values or fall back to cluster defaults
- host_user = '' if is_ssh_config_host else host.get(
- 'user', cluster_user)
- host_identity_file = os.path.expanduser(
- '' if is_ssh_config_host else host.
- get('identity_file', cluster_identity_file))
- host_password = host.get('password', cluster_password)
-
- if host_identity_file and not os.path.isfile(host_identity_file):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- f'SSH Identity File Missing: {host_identity_file}')
-
- hosts_info.append({
- 'ip': host['ip'],
- 'user': host_user,
- 'identity_file': host_identity_file,
- 'password': host_password,
- 'use_ssh_config': is_ssh_config_host
- })
-
- return hosts_info
-
-
  def run_command(cmd, shell=False):
  """Run a local command and return the output."""
  process = subprocess.run(cmd,
@@ -675,10 +502,10 @@ def main():
  password = args.password

  # Check if hosts are in SSH config
- head_use_ssh_config = global_use_ssh_config or check_host_in_ssh_config(
+ head_use_ssh_config = global_use_ssh_config or ssh_utils.check_host_in_ssh_config(
  head_node)
  worker_use_ssh_config = [
- global_use_ssh_config or check_host_in_ssh_config(node)
+ global_use_ssh_config or ssh_utils.check_host_in_ssh_config(node)
  for node in worker_nodes
  ]

@@ -688,10 +515,9 @@ def main():
  kubeconfig_path, args.cleanup)
  else:
  # Using YAML configuration
- targets = load_ssh_targets(args.ssh_node_pools_file)
- clusters_config = get_cluster_config(targets,
- args.infra,
- file_path=args.ssh_node_pools_file)
+ targets = ssh_utils.load_ssh_targets(args.ssh_node_pools_file)
+ clusters_config = ssh_utils.get_cluster_config(
+ targets, args.infra, file_path=args.ssh_node_pools_file)

  # Print information about clusters being processed
  num_clusters = len(clusters_config)
@@ -705,7 +531,8 @@ def main():
  print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
  print(
  f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
- hosts_info = prepare_hosts_info(cluster_name, cluster_config)
+ hosts_info = ssh_utils.prepare_hosts_info(
+ cluster_name, cluster_config)

  if not hosts_info:
  print(
@@ -744,7 +571,7 @@ def main():
  f'Cluster configuration has changed for field {key!r}. '
  f'Previous value: {history.get(key)}, '
  f'Current value: {cluster_config.get(key)}')
- history_hosts_info = prepare_hosts_info(
+ history_hosts_info = ssh_utils.prepare_hosts_info(
  cluster_name, history)
- if not args.cleanup and history_hosts_info[0] != hosts_info[
  0]:
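
With these helpers moved into sky/utils/kubernetes/ssh_utils.py (new in this release, per the file list), the deploy script and other entry points can reuse them. A minimal sketch of the relocated API, based only on the call sites shown above; 'my-cluster' is a hypothetical node pool name:

from sky.utils.kubernetes import ssh_utils

pools_file = ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH
targets = ssh_utils.load_ssh_targets(pools_file)
clusters_config = ssh_utils.get_cluster_config(targets, 'my-cluster',
                                               file_path=pools_file)
for cluster_name, cluster_config in clusters_config.items():
    hosts_info = ssh_utils.prepare_hosts_info(cluster_name, cluster_config)
    print(f'{cluster_name}: {len(hosts_info)} host(s)')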