skypilot-nightly 1.0.0.dev20250529__py3-none-any.whl → 1.0.0.dev20250531__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +99 -16
  3. sky/authentication.py +54 -7
  4. sky/backends/backend_utils.py +37 -24
  5. sky/backends/cloud_vm_ray_backend.py +33 -17
  6. sky/check.py +1 -1
  7. sky/cli.py +43 -15
  8. sky/client/cli.py +43 -15
  9. sky/clouds/cloud.py +20 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +3 -0
  12. sky/clouds/fluidstack.py +3 -0
  13. sky/clouds/gcp.py +10 -3
  14. sky/clouds/kubernetes.py +70 -4
  15. sky/clouds/lambda_cloud.py +3 -0
  16. sky/clouds/nebius.py +57 -14
  17. sky/clouds/paperspace.py +3 -0
  18. sky/clouds/runpod.py +2 -0
  19. sky/clouds/scp.py +3 -0
  20. sky/clouds/vast.py +3 -0
  21. sky/clouds/vsphere.py +3 -0
  22. sky/dashboard/out/404.html +1 -1
  23. sky/dashboard/out/_next/static/bdeJWb62qu7L7FOq1dbXX/_buildManifest.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/236-7458fda7b295f305.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/37-b638675d511d58b4.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/{470-4d003c441839094d.js → 470-9e7a479cc8303baa.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
  28. sky/dashboard/out/_next/static/chunks/682-5c12535476a21ce3.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/856-ab9627e7e8ac35e8.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f270e2c9c59fa1a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-25edb867a41b6b20.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
  35. sky/dashboard/out/_next/static/chunks/pages/{config-7c48919fe030bc43.js → config-3c6a2dabf56e8cd6.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-909f1ceb0fcf1b99.js → [context]-342bc15bb78ab2e5.js} +1 -1
  37. sky/dashboard/out/_next/static/chunks/pages/{infra-d4c6875c88771e17.js → infra-7b4b8e7fa9fa0827.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c0c1dff3cd463d9e.js +11 -0
  39. sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
  42. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
  43. sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
  45. sky/dashboard/out/_next/static/css/2b3ee34e586949a3.css +3 -0
  46. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  47. sky/dashboard/out/clusters/[cluster].html +1 -1
  48. sky/dashboard/out/clusters.html +1 -1
  49. sky/dashboard/out/config.html +1 -1
  50. sky/dashboard/out/index.html +1 -1
  51. sky/dashboard/out/infra/[context].html +1 -1
  52. sky/dashboard/out/infra.html +1 -1
  53. sky/dashboard/out/jobs/[job].html +1 -1
  54. sky/dashboard/out/jobs.html +1 -1
  55. sky/dashboard/out/users.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/exceptions.py +10 -0
  60. sky/global_user_state.py +149 -1
  61. sky/jobs/client/sdk.py +3 -0
  62. sky/jobs/constants.py +1 -1
  63. sky/jobs/server/core.py +8 -3
  64. sky/jobs/state.py +24 -5
  65. sky/jobs/utils.py +34 -11
  66. sky/provision/gcp/config.py +3 -1
  67. sky/provision/gcp/constants.py +10 -0
  68. sky/provision/kubernetes/utils.py +2 -1
  69. sky/provision/provisioner.py +15 -10
  70. sky/resources.py +44 -3
  71. sky/serve/controller.py +10 -7
  72. sky/serve/replica_managers.py +22 -18
  73. sky/serve/service.py +5 -4
  74. sky/server/common.py +5 -2
  75. sky/server/constants.py +1 -1
  76. sky/server/requests/payloads.py +1 -0
  77. sky/server/stream_utils.py +21 -0
  78. sky/templates/kubernetes-ray.yml.j2 +26 -1
  79. sky/utils/common_utils.py +66 -0
  80. sky/utils/resources_utils.py +26 -0
  81. sky/utils/rich_utils.py +5 -0
  82. sky/utils/schemas.py +23 -1
  83. {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/METADATA +1 -1
  84. {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/RECORD +90 -91
  85. sky/dashboard/out/_next/static/HvNkg7hqKM1p0ptAcdDcF/_buildManifest.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/236-90e5498a5b00ec29.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/303-2c7b0f7af571710b.js +0 -6
  88. sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
  89. sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
  90. sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
  91. sky/dashboard/out/_next/static/chunks/856-59a1760784c9e770.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6b80e9e0c6aa16a1.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
  98. sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
  99. /sky/dashboard/out/_next/static/{HvNkg7hqKM1p0ptAcdDcF → bdeJWb62qu7L7FOq1dbXX}/_ssgManifest.js +0 -0
  100. /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-ad1edd7fe17ea796.js} +0 -0
  101. {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/WHEEL +0 -0
  102. {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/entry_points.txt +0 -0
  103. {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/licenses/LICENSE +0 -0
  104. {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/top_level.txt +0 -0
@@ -15,6 +15,7 @@ import colorama
15
15
  import sky
16
16
  from sky import clouds
17
17
  from sky import exceptions
18
+ from sky import global_user_state
18
19
  from sky import provision
19
20
  from sky import sky_logging
20
21
  from sky import skypilot_config
@@ -118,7 +119,7 @@ def bulk_provision(
118
119
  Cloud specific exceptions: If the provisioning process failed, cloud-
119
120
  specific exceptions will be raised by the cloud APIs.
120
121
  """
121
- original_config = common_utils.read_yaml(cluster_yaml)
122
+ original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
122
123
  head_node_type = original_config['head_node_type']
123
124
  bootstrap_config = provision_common.ProvisionConfig(
124
125
  provider_config=original_config['provider'],
@@ -413,9 +414,11 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
413
414
 
414
415
  def _post_provision_setup(
415
416
  cloud_name: str, cluster_name: resources_utils.ClusterName,
416
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
417
+ handle_cluster_yaml: str,
418
+ provision_record: provision_common.ProvisionRecord,
417
419
  custom_resource: Optional[str]) -> provision_common.ClusterInfo:
418
- config_from_yaml = common_utils.read_yaml(cluster_yaml)
420
+ config_from_yaml = global_user_state.get_cluster_yaml_dict(
421
+ handle_cluster_yaml)
419
422
  provider_config = config_from_yaml.get('provider')
420
423
  cluster_info = provision.get_cluster_info(cloud_name,
421
424
  provision_record.region,
@@ -446,7 +449,7 @@ def _post_provision_setup(
446
449
  # TODO(suquark): Move wheel build here in future PRs.
447
450
  # We don't set docker_user here, as we are configuring the VM itself.
448
451
  ssh_credentials = backend_utils.ssh_credential_from_yaml(
449
- cluster_yaml, ssh_user=cluster_info.ssh_user)
452
+ handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
450
453
  docker_config = config_from_yaml.get('docker', {})
451
454
 
452
455
  with rich_utils.safe_status(
@@ -657,7 +660,8 @@ def _post_provision_setup(
657
660
  @timeline.event
658
661
  def post_provision_runtime_setup(
659
662
  cloud_name: str, cluster_name: resources_utils.ClusterName,
660
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
663
+ handle_cluster_yaml: str,
664
+ provision_record: provision_common.ProvisionRecord,
661
665
  custom_resource: Optional[str],
662
666
  log_dir: str) -> provision_common.ClusterInfo:
663
667
  """Run internal setup commands after provisioning and before user setup.
@@ -675,11 +679,12 @@ def post_provision_runtime_setup(
675
679
  with provision_logging.setup_provision_logging(log_dir):
676
680
  try:
677
681
  logger.debug(_TITLE.format('System Setup After Provision'))
678
- return _post_provision_setup(cloud_name,
679
- cluster_name,
680
- cluster_yaml=cluster_yaml,
681
- provision_record=provision_record,
682
- custom_resource=custom_resource)
682
+ return _post_provision_setup(
683
+ cloud_name,
684
+ cluster_name,
685
+ handle_cluster_yaml=handle_cluster_yaml,
686
+ provision_record=provision_record,
687
+ custom_resource=custom_resource)
683
688
  except Exception: # pylint: disable=broad-except
684
689
  logger.error(
685
690
  ux_utils.error_message(
sky/resources.py CHANGED
@@ -98,7 +98,7 @@ class Resources:
98
98
  """
99
99
  # If any fields changed, increment the version. For backward compatibility,
100
100
  # modify the __setstate__ method to handle the old version.
101
- _VERSION = 25
101
+ _VERSION = 26
102
102
 
103
103
  def __init__(
104
104
  self,
@@ -117,6 +117,7 @@ class Resources:
117
117
  image_id: Union[Dict[Optional[str], str], str, None] = None,
118
118
  disk_size: Optional[int] = None,
119
119
  disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
120
+ network_tier: Optional[Union[str, resources_utils.NetworkTier]] = None,
120
121
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
121
122
  labels: Optional[Dict[str, str]] = None,
122
123
  autostop: Union[bool, int, Dict[str, Any], None] = None,
@@ -202,6 +203,8 @@ class Resources:
202
203
  disk_size: the size of the OS disk in GiB.
203
204
  disk_tier: the disk performance tier to use. If None, defaults to
204
205
  ``'medium'``.
206
+ network_tier: the network performance tier to use. If None, defaults to
207
+ ``'standard'``.
205
208
  ports: the ports to open on the instance.
206
209
  labels: the labels to apply to the instance. These are useful for
207
210
  assigning metadata that may be used by external tools.
@@ -309,6 +312,20 @@ class Resources:
309
312
  disk_tier = resources_utils.DiskTier(disk_tier_str)
310
313
  self._disk_tier = disk_tier
311
314
 
315
+ if isinstance(network_tier, str):
316
+ network_tier_str = str(network_tier).lower()
317
+ supported_tiers = [
318
+ tier.value for tier in resources_utils.NetworkTier
319
+ ]
320
+ if network_tier_str not in supported_tiers:
321
+ with ux_utils.print_exception_no_traceback():
322
+ raise ValueError(
323
+ f'Invalid network_tier {network_tier_str!r}. '
324
+ f'Network tier must be one of '
325
+ f'{", ".join(supported_tiers)}.')
326
+ network_tier = resources_utils.NetworkTier(network_tier_str)
327
+ self._network_tier = network_tier
328
+
312
329
  if ports is not None:
313
330
  if isinstance(ports, tuple):
314
331
  ports = list(ports)
@@ -418,6 +435,10 @@ class Resources:
418
435
  if self.disk_tier is not None:
419
436
  disk_tier = f', disk_tier={self.disk_tier.value}'
420
437
 
438
+ network_tier = ''
439
+ if self.network_tier is not None:
440
+ network_tier = f', network_tier={self.network_tier.value}'
441
+
421
442
  disk_size = ''
422
443
  if self.disk_size != _DEFAULT_DISK_SIZE_GB:
423
444
  disk_size = f', disk_size={self.disk_size}'
@@ -437,7 +458,7 @@ class Resources:
437
458
  hardware_str = (
438
459
  f'{instance_type}{use_spot}'
439
460
  f'{cpus}{memory}{accelerators}{accelerator_args}{image_id}'
440
- f'{disk_tier}{disk_size}{ports}')
461
+ f'{disk_tier}{network_tier}{disk_size}{ports}')
441
462
  # It may have leading ',' (for example, instance_type not set) or empty
442
463
  # spaces. Remove them.
443
464
  while hardware_str and hardware_str[0] in (',', ' '):
@@ -567,6 +588,10 @@ class Resources:
567
588
  def disk_tier(self) -> Optional[resources_utils.DiskTier]:
568
589
  return self._disk_tier
569
590
 
591
+ @property
592
+ def network_tier(self) -> Optional[resources_utils.NetworkTier]:
593
+ return self._network_tier
594
+
570
595
  @property
571
596
  def ports(self) -> Optional[List[str]]:
572
597
  return self._ports
@@ -1223,7 +1248,6 @@ class Resources:
1223
1248
 
1224
1249
  def _try_validate_volumes(self) -> None:
1225
1250
  """Try to validate the volumes attribute.
1226
-
1227
1251
  Raises:
1228
1252
  ValueError: if the attribute is invalid.
1229
1253
  """
@@ -1532,6 +1556,12 @@ class Resources:
1532
1556
  if not (self.disk_tier <= other.disk_tier): # pylint: disable=superfluous-parens
1533
1557
  return False
1534
1558
 
1559
+ if self.network_tier is not None:
1560
+ if other.network_tier is None:
1561
+ return False
1562
+ if not self.network_tier <= other.network_tier:
1563
+ return False
1564
+
1535
1565
  if check_ports:
1536
1566
  if self.ports is not None:
1537
1567
  if other.ports is None:
@@ -1586,6 +1616,7 @@ class Resources:
1586
1616
  not self._use_spot_specified,
1587
1617
  self._disk_size == _DEFAULT_DISK_SIZE_GB,
1588
1618
  self._disk_tier is None,
1619
+ self._network_tier is None,
1589
1620
  self._image_id is None,
1590
1621
  self._ports is None,
1591
1622
  self._docker_login_config is None,
@@ -1629,6 +1660,7 @@ class Resources:
1629
1660
  zone=override.pop('zone', self.zone),
1630
1661
  image_id=override.pop('image_id', self.image_id),
1631
1662
  disk_tier=override.pop('disk_tier', self.disk_tier),
1663
+ network_tier=override.pop('network_tier', self.network_tier),
1632
1664
  ports=override.pop('ports', self.ports),
1633
1665
  labels=override.pop('labels', self.labels),
1634
1666
  autostop=override.pop('autostop', current_autostop_config),
@@ -1667,6 +1699,9 @@ class Resources:
1667
1699
  if (self.disk_tier is not None and
1668
1700
  self.disk_tier != resources_utils.DiskTier.BEST):
1669
1701
  features.add(clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
1702
+ if (self.network_tier is not None and
1703
+ self.network_tier == resources_utils.NetworkTier.BEST):
1704
+ features.add(clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER)
1670
1705
  if self.extract_docker_image() is not None:
1671
1706
  features.add(clouds.CloudImplementationFeatures.DOCKER_IMAGE)
1672
1707
  elif self.image_id is not None:
@@ -1845,6 +1880,7 @@ class Resources:
1845
1880
  resources_fields['disk_size'] = config.pop('disk_size', None)
1846
1881
  resources_fields['image_id'] = config.pop('image_id', None)
1847
1882
  resources_fields['disk_tier'] = config.pop('disk_tier', None)
1883
+ resources_fields['network_tier'] = config.pop('network_tier', None)
1848
1884
  resources_fields['ports'] = config.pop('ports', None)
1849
1885
  resources_fields['labels'] = config.pop('labels', None)
1850
1886
  resources_fields['autostop'] = config.pop('autostop', None)
@@ -1897,6 +1933,8 @@ class Resources:
1897
1933
  add_if_not_none('image_id', self.image_id)
1898
1934
  if self.disk_tier is not None:
1899
1935
  config['disk_tier'] = self.disk_tier.value
1936
+ if self.network_tier is not None:
1937
+ config['network_tier'] = self.network_tier.value
1900
1938
  add_if_not_none('ports', self.ports)
1901
1939
  add_if_not_none('labels', self.labels)
1902
1940
  if self.volumes is not None:
@@ -2081,6 +2119,9 @@ class Resources:
2081
2119
  if isinstance(state.get('_cloud', None), clouds.Kubernetes):
2082
2120
  _maybe_add_docker_prefix_to_image_id(state['_image_id'])
2083
2121
 
2122
+ if version < 26:
2123
+ self._network_tier = state.get('_network_tier', None)
2124
+
2084
2125
  self.__dict__.update(state)
2085
2126
 
2086
2127
 
sky/serve/controller.py CHANGED
@@ -42,12 +42,13 @@ class SkyServeController:
42
42
  """
43
43
 
44
44
  def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
45
- task_yaml: str, host: str, port: int) -> None:
45
+ service_task_yaml: str, host: str, port: int) -> None:
46
46
  self._service_name = service_name
47
47
  self._replica_manager: replica_managers.ReplicaManager = (
48
- replica_managers.SkyPilotReplicaManager(service_name=service_name,
49
- spec=service_spec,
50
- task_yaml_path=task_yaml))
48
+ replica_managers.SkyPilotReplicaManager(
49
+ service_name=service_name,
50
+ spec=service_spec,
51
+ service_task_yaml_path=service_task_yaml))
51
52
  self._autoscaler: autoscalers.Autoscaler = (
52
53
  autoscalers.Autoscaler.from_spec(service_name, service_spec))
53
54
  self._host = host
@@ -240,7 +241,9 @@ class SkyServeController:
240
241
  # TODO(tian): Probably we should support service that will stop the VM in
241
242
  # specific time period.
242
243
  def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
243
- task_yaml: str, controller_host: str, controller_port: int):
244
- controller = SkyServeController(service_name, service_spec, task_yaml,
245
- controller_host, controller_port)
244
+ service_task_yaml: str, controller_host: str,
245
+ controller_port: int):
246
+ controller = SkyServeController(service_name, service_spec,
247
+ service_task_yaml, controller_host,
248
+ controller_port)
246
249
  controller.run()
@@ -58,7 +58,7 @@ _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
58
58
  # TODO(tian): Combine this with
59
59
  # sky/spot/recovery_strategy.py::StrategyExecutor::launch
60
60
  def launch_cluster(replica_id: int,
61
- task_yaml_path: str,
61
+ service_task_yaml_path: str,
62
62
  cluster_name: str,
63
63
  resources_override: Optional[Dict[str, Any]] = None,
64
64
  retry_until_up: bool = True,
@@ -78,7 +78,8 @@ def launch_cluster(replica_id: int,
78
78
  f'{cluster_name} with resources override: '
79
79
  f'{resources_override}')
80
80
  try:
81
- config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
81
+ config = common_utils.read_yaml(
82
+ os.path.expanduser(service_task_yaml_path))
82
83
  task = sky.Task.from_yaml_config(config)
83
84
  if resources_override is not None:
84
85
  resources = task.resources
@@ -173,9 +174,9 @@ def terminate_cluster(cluster_name: str,
173
174
  time.sleep(gap_seconds)
174
175
 
175
176
 
176
- def _get_resources_ports(task_yaml: str) -> str:
177
+ def _get_resources_ports(service_task_yaml_path: str) -> str:
177
178
  """Get the resources ports used by the task."""
178
- task = sky.Task.from_yaml(task_yaml)
179
+ task = sky.Task.from_yaml(service_task_yaml_path)
179
180
  # Already checked all ports are valid in sky.serve.core.up
180
181
  assert task.resources, task
181
182
  assert task.service is not None, task
@@ -183,7 +184,7 @@ def _get_resources_ports(task_yaml: str) -> str:
183
184
  return task.service.ports
184
185
 
185
186
 
186
- def _should_use_spot(task_yaml: str,
187
+ def _should_use_spot(service_task_yaml_path: str,
187
188
  resource_override: Optional[Dict[str, Any]]) -> bool:
188
189
  """Get whether the task should use spot."""
189
190
  if resource_override is not None:
@@ -191,7 +192,7 @@ def _should_use_spot(task_yaml: str,
191
192
  if use_spot_override is not None:
192
193
  assert isinstance(use_spot_override, bool)
193
194
  return use_spot_override
194
- task = sky.Task.from_yaml(task_yaml)
195
+ task = sky.Task.from_yaml(service_task_yaml_path)
195
196
  spot_use_resources = [
196
197
  resources for resources in task.resources if resources.use_spot
197
198
  ]
@@ -634,10 +635,10 @@ class SkyPilotReplicaManager(ReplicaManager):
634
635
  """
635
636
 
636
637
  def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
637
- task_yaml_path: str) -> None:
638
+ service_task_yaml_path: str) -> None:
638
639
  super().__init__(service_name, spec)
639
- self._task_yaml_path = task_yaml_path
640
- task = sky.Task.from_yaml(task_yaml_path)
640
+ self.service_task_yaml_path = service_task_yaml_path
641
+ task = sky.Task.from_yaml(service_task_yaml_path)
641
642
  self._spot_placer: Optional[spot_placer.SpotPlacer] = (
642
643
  spot_placer.SpotPlacer.from_task(spec, task))
643
644
  # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -714,7 +715,8 @@ class SkyPilotReplicaManager(ReplicaManager):
714
715
  self._service_name, replica_id)
715
716
  log_file_name = serve_utils.generate_replica_launch_log_file_name(
716
717
  self._service_name, replica_id)
717
- use_spot = _should_use_spot(self._task_yaml_path, resources_override)
718
+ use_spot = _should_use_spot(self.service_task_yaml_path,
719
+ resources_override)
718
720
  retry_until_up = True
719
721
  location = None
720
722
  if use_spot and self._spot_placer is not None:
@@ -742,10 +744,10 @@ class SkyPilotReplicaManager(ReplicaManager):
742
744
  launch_cluster,
743
745
  log_file_name,
744
746
  ).run,
745
- args=(replica_id, self._task_yaml_path, cluster_name,
747
+ args=(replica_id, self.service_task_yaml_path, cluster_name,
746
748
  resources_override, retry_until_up),
747
749
  )
748
- replica_port = _get_resources_ports(self._task_yaml_path)
750
+ replica_port = _get_resources_ports(self.service_task_yaml_path)
749
751
 
750
752
  info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
751
753
  location, self.latest_version, resources_override)
@@ -1290,11 +1292,11 @@ class SkyPilotReplicaManager(ReplicaManager):
1290
1292
  logger.error(f'Invalid version: {version}, '
1291
1293
  f'latest version: {self.latest_version}')
1292
1294
  return
1293
- task_yaml_path = serve_utils.generate_task_yaml_file_name(
1295
+ service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
1294
1296
  self._service_name, version)
1295
1297
  serve_state.add_or_update_version(self._service_name, version, spec)
1296
1298
  self.latest_version = version
1297
- self._task_yaml_path = task_yaml_path
1299
+ self.service_task_yaml_path = service_task_yaml_path
1298
1300
  self._update_mode = update_mode
1299
1301
 
1300
1302
  # Reuse all replicas that have the same config as the new version
@@ -1302,7 +1304,8 @@ class SkyPilotReplicaManager(ReplicaManager):
1302
1304
  # the latest version. This can significantly improve the speed
1303
1305
  # for updating an existing service with only config changes to the
1304
1306
  # service specs, e.g. scale down the service.
1305
- new_config = common_utils.read_yaml(os.path.expanduser(task_yaml_path))
1307
+ new_config = common_utils.read_yaml(
1308
+ os.path.expanduser(service_task_yaml_path))
1306
1309
  # Always create new replicas and scale down old ones when file_mounts
1307
1310
  # are not empty.
1308
1311
  if new_config.get('file_mounts', None) != {}:
@@ -1313,10 +1316,11 @@ class SkyPilotReplicaManager(ReplicaManager):
1313
1316
  for info in replica_infos:
1314
1317
  if info.version < version and not info.is_terminal:
1315
1318
  # Assume user does not change the yaml file on the controller.
1316
- old_task_yaml_path = serve_utils.generate_task_yaml_file_name(
1317
- self._service_name, info.version)
1319
+ old_service_task_yaml_path = (
1320
+ serve_utils.generate_task_yaml_file_name(
1321
+ self._service_name, info.version))
1318
1322
  old_config = common_utils.read_yaml(
1319
- os.path.expanduser(old_task_yaml_path))
1323
+ os.path.expanduser(old_service_task_yaml_path))
1320
1324
  for key in ['service']:
1321
1325
  old_config.pop(key)
1322
1326
  # Bump replica version if all fields except for service are
sky/serve/service.py CHANGED
@@ -186,7 +186,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
186
186
 
187
187
  service_dir = os.path.expanduser(
188
188
  serve_utils.generate_remote_service_dir_name(service_name))
189
- task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
189
+ service_task_yaml = serve_utils.generate_task_yaml_file_name(
190
+ service_name, version)
190
191
 
191
192
  if not is_recovery:
192
193
  if (len(serve_state.get_services()) >=
@@ -218,7 +219,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
218
219
  # don't want the new file mounts to overwrite the old one, so we
219
220
  # sync to a tmp file first and then copy it to the final name
220
221
  # if there is no name conflict.
221
- shutil.copy(tmp_task_yaml, task_yaml)
222
+ shutil.copy(tmp_task_yaml, service_task_yaml)
222
223
 
223
224
  controller_process = None
224
225
  load_balancer_process = None
@@ -249,8 +250,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
249
250
  controller_host = _get_controller_host()
250
251
  controller_process = multiprocessing.Process(
251
252
  target=controller.run_controller,
252
- args=(service_name, service_spec, task_yaml, controller_host,
253
- controller_port))
253
+ args=(service_name, service_spec, service_task_yaml,
254
+ controller_host, controller_port))
254
255
  controller_process.start()
255
256
 
256
257
  if not is_recovery:
sky/server/common.py CHANGED
@@ -533,10 +533,13 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
533
533
  api_server_status = None
534
534
  try:
535
535
  api_server_status = check_server_healthy()
536
+ if api_server_status == ApiServerStatus.NEEDS_AUTH:
537
+ endpoint = get_server_url()
538
+ with ux_utils.print_exception_no_traceback():
539
+ raise exceptions.ApiServerAuthenticationError(endpoint)
536
540
  except exceptions.ApiServerConnectionError as exc:
537
541
  endpoint = get_server_url()
538
- if (not is_api_server_local() or
539
- api_server_status == ApiServerStatus.NEEDS_AUTH):
542
+ if not is_api_server_local():
540
543
  with ux_utils.print_exception_no_traceback():
541
544
  raise exceptions.ApiServerConnectionError(endpoint) from exc
542
545
  # Lock to prevent multiple processes from starting the server at the
sky/server/constants.py CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
7
7
  # API server version, whenever there is a change in API server that requires a
8
8
  # restart of the local API server or error out when the client does not match
9
9
  # the server version.
10
- API_VERSION = '6'
10
+ API_VERSION = '7'
11
11
 
12
12
  # Prefix for API request names.
13
13
  REQUEST_NAME_PREFIX = 'sky.'
@@ -376,6 +376,7 @@ class JobsLogsBody(RequestBody):
376
376
  follow: bool = True
377
377
  controller: bool = False
378
378
  refresh: bool = False
379
+ tail: Optional[int] = None
379
380
 
380
381
 
381
382
  class RequestCancelBody(RequestBody):
@@ -15,6 +15,8 @@ from sky.utils import rich_utils
15
15
 
16
16
  logger = sky_logging.init_logger(__name__)
17
17
 
18
+ _HEARTBEAT_INTERVAL = 30
19
+
18
20
 
19
21
  async def _yield_log_file_with_payloads_skipped(
20
22
  log_file) -> AsyncGenerator[str, None]:
@@ -90,6 +92,8 @@ async def log_streamer(request_id: Optional[str],
90
92
  for line_str in lines:
91
93
  yield line_str
92
94
 
95
+ last_heartbeat_time = asyncio.get_event_loop().time()
96
+
93
97
  while True:
94
98
  # Sleep 0 to yield control to allow other coroutines to run,
95
99
  # while keeping the loop tight to make the log stream responsive.
@@ -106,15 +110,32 @@ async def log_streamer(request_id: Optional[str],
106
110
  break
107
111
  if not follow:
108
112
  break
113
+
114
+ current_time = asyncio.get_event_loop().time()
115
+ if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
116
+ # Currently just used to keep the connection busy, refer to
117
+ # https://github.com/skypilot-org/skypilot/issues/5750 for
118
+ # more details.
119
+ yield message_utils.encode_payload(
120
+ rich_utils.Control.HEARTBEAT.encode(''))
121
+ last_heartbeat_time = current_time
122
+
109
123
  # Sleep shortly to avoid storming the DB and CPU, this has
110
124
  # little impact on the responsiveness here since we are waiting
111
125
  # for a new line to come in.
112
126
  await asyncio.sleep(0.1)
113
127
  continue
128
+
129
+ # Refresh the heartbeat time, this is a trivial optimization for
130
+ # performance but it helps avoid unnecessary heartbeat strings
131
+ # being printed when the client runs in an old version.
132
+ last_heartbeat_time = asyncio.get_event_loop().time()
114
133
  line_str = line.decode('utf-8')
115
134
  if plain_logs:
116
135
  is_payload, line_str = message_utils.decode_payload(
117
136
  line_str, raise_for_mismatch=False)
137
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
138
+ # sending invisible characters might be okay.
118
139
  if is_payload:
119
140
  continue
120
141
  yield line_str
@@ -395,6 +395,13 @@ available_node_types:
395
395
  # STEP 1: Run apt update, install missing packages, and set up ssh.
396
396
  (
397
397
  (
398
+ # For backwards compatibility, we put a marker file in the pod
399
+ # to indicate that the apt ssh setup step will write a completion
400
+ # marker file (/tmp/apt_ssh_setup_complete) to the pod.
401
+ # TODO: Remove this marker file and its usage in setup_commands
402
+ # after v0.11.0 release.
403
+ touch /tmp/apt_ssh_setup_started
404
+
398
405
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
399
406
  echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
400
407
  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
@@ -402,7 +409,7 @@ available_node_types:
402
409
  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
403
410
 
404
411
  # Separate packages into two groups: packages that are installed first
405
- # so that curl, rsync and wget are available sooner to unblock the following
412
+ # so that curl, rsync, ssh and wget are available sooner to unblock the following
406
413
  # conda installation and rsync.
407
414
  # Also, we install fuse first to avoid conflicts with fuse3.
408
415
  set -e
@@ -494,6 +501,8 @@ available_node_types:
494
501
  $(prefix_cmd) service ssh restart;
495
502
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
496
503
 
504
+ touch /tmp/apt_ssh_setup_complete
505
+ echo "=== SSH setup completed ==="
497
506
  ) > /tmp/${STEPS[0]}.log 2>&1 || {
498
507
  echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
499
508
  cat /tmp/${STEPS[0]}.log
@@ -688,6 +697,13 @@ available_node_types:
688
697
  {{k8s_resource_key}}: {{accelerator_count}}
689
698
  {% endif %}
690
699
  {% endif %}
700
+ {% if k8s_ipc_lock_capability %}
701
+ securityContext:
702
+ capabilities:
703
+ add:
704
+ - IPC_LOCK
705
+ {% endif %}
706
+
691
707
 
692
708
  {% if high_availability %}
693
709
  pvc_spec:
@@ -791,6 +807,15 @@ setup_commands:
791
807
  {%- endfor %}
792
808
  STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
793
809
  start_epoch=$(date +%s);
810
+
811
+ # Wait for SSH setup to complete before proceeding
812
+ if [ -f /tmp/apt_ssh_setup_started ]; then
813
+ echo "=== Logs for asynchronous SSH setup ===";
814
+ [ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
815
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
816
+ [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
817
+ fi
818
+
794
819
  echo "=== Logs for asynchronous ray and skypilot installation ===";
795
820
  if [ -f /tmp/skypilot_is_nimbus ]; then
796
821
  echo "=== Logs for asynchronous ray and skypilot installation ===";
sky/utils/common_utils.py CHANGED
@@ -324,9 +324,75 @@ def get_pretty_entrypoint_cmd() -> str:
324
324
  # Turn '/.../anaconda/envs/py36/bin/sky' into 'sky', but keep other
325
325
  # things like 'examples/app.py'.
326
326
  argv[0] = basename
327
+
328
+ # Redact sensitive environment variable values
329
+ argv = _redact_env_values(argv)
330
+
327
331
  return ' '.join(argv)
328
332
 
329
333
 
334
+ def _redact_env_values(argv: List[str]) -> List[str]:
335
+ """Redact sensitive values from --env arguments.
336
+
337
+ Args:
338
+ argv: Command line arguments
339
+
340
+ Returns:
341
+ Modified argv with redacted --env values, or original argv if any error
342
+
343
+ Examples:
344
+ ['sky', 'launch', '--env', 'HF_TOKEN=secret'] ->
345
+ ['sky', 'launch', '--env', 'HF_TOKEN=<redacted>']
346
+
347
+ ['sky', 'launch', '--env=HF_TOKEN=secret'] ->
348
+ ['sky', 'launch', '--env=HF_TOKEN=<redacted>']
349
+
350
+ ['sky', 'launch', '--env', 'HF_TOKEN'] ->
351
+ ['sky', 'launch', '--env', 'HF_TOKEN'] (no change)
352
+ """
353
+ try:
354
+ if not argv:
355
+ return argv or []
356
+
357
+ result = []
358
+ i = 0
359
+
360
+ while i < len(argv):
361
+ arg = argv[i]
362
+
363
+ # Ensure arg is a string
364
+ if not isinstance(arg, str):
365
+ result.append(arg)
366
+ i += 1
367
+ continue
368
+
369
+ if arg == '--env' and i + 1 < len(argv):
370
+ result.append(arg)
371
+ next_arg = argv[i + 1]
372
+ # Ensure next_arg is a string and handle redaction safely
373
+ if isinstance(next_arg, str):
374
+ redacted = re.sub(r'^([^=]+)=.*', r'\1=<redacted>',
375
+ next_arg)
376
+ result.append(redacted)
377
+ else:
378
+ result.append(next_arg)
379
+ i += 2
380
+ elif arg.startswith('--env='):
381
+ # Redact only if there's a value after the key
382
+ redacted = re.sub(r'^(--env=[^=]+)=.*', r'\1=<redacted>', arg)
383
+ result.append(redacted)
384
+ i += 1
385
+ else:
386
+ result.append(arg)
387
+ i += 1
388
+
389
+ return result
390
+ except Exception: # pylint: disable=broad-except
391
+ # If anything goes wrong with redaction, return original argv
392
+ # This ensures the command can still execute
393
+ return argv or []
394
+
395
+
330
396
  def user_and_hostname_hash() -> str:
331
397
  """Returns a string containing <user>-<hostname hash last 4 chars>.
332
398
 
@@ -50,6 +50,32 @@ class DiskTier(enum.Enum):
50
50
  return types.index(self) <= types.index(other)
51
51
 
52
52
 
53
class NetworkTier(enum.Enum):
    """Network performance tiers that SkyPilot understands.

    Members are declared from lowest to highest tier; ``<=`` compares
    positions in that declaration order.
    """
    STANDARD = 'standard'
    BEST = 'best'

    @classmethod
    def supported_tiers(cls) -> List[str]:
        """Return the string value of every tier, in declaration order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

    @classmethod
    def cli_help_message(cls) -> str:
        """Return the help text listing the valid tiers and the default."""
        choices = ', '.join(cls.supported_tiers())
        default = cls.STANDARD.value
        return f'Network tier. Could be one of {choices}. Default: {default}'

    @classmethod
    def from_str(cls, tier: str) -> 'NetworkTier':
        """Convert a tier string to its enum member.

        Raises:
            ValueError: if ``tier`` is not one of the supported tier values.
        """
        if tier in cls.supported_tiers():
            return cls(tier)
        raise ValueError(f'Invalid network tier: {tier}')

    def __le__(self, other: 'NetworkTier') -> bool:
        """Return True if this tier is declared at or before ``other``."""
        ordering = list(NetworkTier)
        own_rank = ordering.index(self)
        other_rank = ordering.index(other)
        return own_rank <= other_rank
77
+
78
+
53
79
  class StorageType(enum.Enum):
54
80
  """Storage type."""
55
81
  # Durable network storage, e.g. GCP persistent disks