skypilot-nightly 1.0.0.dev20250529__py3-none-any.whl → 1.0.0.dev20250531__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +99 -16
- sky/authentication.py +54 -7
- sky/backends/backend_utils.py +37 -24
- sky/backends/cloud_vm_ray_backend.py +33 -17
- sky/check.py +1 -1
- sky/cli.py +43 -15
- sky/client/cli.py +43 -15
- sky/clouds/cloud.py +20 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +10 -3
- sky/clouds/kubernetes.py +70 -4
- sky/clouds/lambda_cloud.py +3 -0
- sky/clouds/nebius.py +57 -14
- sky/clouds/paperspace.py +3 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +3 -0
- sky/clouds/vast.py +3 -0
- sky/clouds/vsphere.py +3 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bdeJWb62qu7L7FOq1dbXX/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-7458fda7b295f305.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-b638675d511d58b4.js +6 -0
- sky/dashboard/out/_next/static/chunks/{470-4d003c441839094d.js → 470-9e7a479cc8303baa.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
- sky/dashboard/out/_next/static/chunks/682-5c12535476a21ce3.js +6 -0
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
- sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-ab9627e7e8ac35e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f270e2c9c59fa1a.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-25edb867a41b6b20.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-7c48919fe030bc43.js → config-3c6a2dabf56e8cd6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-909f1ceb0fcf1b99.js → [context]-342bc15bb78ab2e5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-d4c6875c88771e17.js → infra-7b4b8e7fa9fa0827.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c0c1dff3cd463d9e.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
- sky/dashboard/out/_next/static/css/2b3ee34e586949a3.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +10 -0
- sky/global_user_state.py +149 -1
- sky/jobs/client/sdk.py +3 -0
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +8 -3
- sky/jobs/state.py +24 -5
- sky/jobs/utils.py +34 -11
- sky/provision/gcp/config.py +3 -1
- sky/provision/gcp/constants.py +10 -0
- sky/provision/kubernetes/utils.py +2 -1
- sky/provision/provisioner.py +15 -10
- sky/resources.py +44 -3
- sky/serve/controller.py +10 -7
- sky/serve/replica_managers.py +22 -18
- sky/serve/service.py +5 -4
- sky/server/common.py +5 -2
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +1 -0
- sky/server/stream_utils.py +21 -0
- sky/templates/kubernetes-ray.yml.j2 +26 -1
- sky/utils/common_utils.py +66 -0
- sky/utils/resources_utils.py +26 -0
- sky/utils/rich_utils.py +5 -0
- sky/utils/schemas.py +23 -1
- {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/RECORD +90 -91
- sky/dashboard/out/_next/static/HvNkg7hqKM1p0ptAcdDcF/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-90e5498a5b00ec29.js +0 -6
- sky/dashboard/out/_next/static/chunks/303-2c7b0f7af571710b.js +0 -6
- sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
- sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-59a1760784c9e770.js +0 -1
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6b80e9e0c6aa16a1.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
- sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
- /sky/dashboard/out/_next/static/{HvNkg7hqKM1p0ptAcdDcF → bdeJWb62qu7L7FOq1dbXX}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-ad1edd7fe17ea796.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250529.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/top_level.txt +0 -0
sky/provision/provisioner.py
CHANGED
@@ -15,6 +15,7 @@ import colorama
|
|
15
15
|
import sky
|
16
16
|
from sky import clouds
|
17
17
|
from sky import exceptions
|
18
|
+
from sky import global_user_state
|
18
19
|
from sky import provision
|
19
20
|
from sky import sky_logging
|
20
21
|
from sky import skypilot_config
|
@@ -118,7 +119,7 @@ def bulk_provision(
|
|
118
119
|
Cloud specific exceptions: If the provisioning process failed, cloud-
|
119
120
|
specific exceptions will be raised by the cloud APIs.
|
120
121
|
"""
|
121
|
-
original_config =
|
122
|
+
original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
|
122
123
|
head_node_type = original_config['head_node_type']
|
123
124
|
bootstrap_config = provision_common.ProvisionConfig(
|
124
125
|
provider_config=original_config['provider'],
|
@@ -413,9 +414,11 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
413
414
|
|
414
415
|
def _post_provision_setup(
|
415
416
|
cloud_name: str, cluster_name: resources_utils.ClusterName,
|
416
|
-
|
417
|
+
handle_cluster_yaml: str,
|
418
|
+
provision_record: provision_common.ProvisionRecord,
|
417
419
|
custom_resource: Optional[str]) -> provision_common.ClusterInfo:
|
418
|
-
config_from_yaml =
|
420
|
+
config_from_yaml = global_user_state.get_cluster_yaml_dict(
|
421
|
+
handle_cluster_yaml)
|
419
422
|
provider_config = config_from_yaml.get('provider')
|
420
423
|
cluster_info = provision.get_cluster_info(cloud_name,
|
421
424
|
provision_record.region,
|
@@ -446,7 +449,7 @@ def _post_provision_setup(
|
|
446
449
|
# TODO(suquark): Move wheel build here in future PRs.
|
447
450
|
# We don't set docker_user here, as we are configuring the VM itself.
|
448
451
|
ssh_credentials = backend_utils.ssh_credential_from_yaml(
|
449
|
-
|
452
|
+
handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
|
450
453
|
docker_config = config_from_yaml.get('docker', {})
|
451
454
|
|
452
455
|
with rich_utils.safe_status(
|
@@ -657,7 +660,8 @@ def _post_provision_setup(
|
|
657
660
|
@timeline.event
|
658
661
|
def post_provision_runtime_setup(
|
659
662
|
cloud_name: str, cluster_name: resources_utils.ClusterName,
|
660
|
-
|
663
|
+
handle_cluster_yaml: str,
|
664
|
+
provision_record: provision_common.ProvisionRecord,
|
661
665
|
custom_resource: Optional[str],
|
662
666
|
log_dir: str) -> provision_common.ClusterInfo:
|
663
667
|
"""Run internal setup commands after provisioning and before user setup.
|
@@ -675,11 +679,12 @@ def post_provision_runtime_setup(
|
|
675
679
|
with provision_logging.setup_provision_logging(log_dir):
|
676
680
|
try:
|
677
681
|
logger.debug(_TITLE.format('System Setup After Provision'))
|
678
|
-
return _post_provision_setup(
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
682
|
+
return _post_provision_setup(
|
683
|
+
cloud_name,
|
684
|
+
cluster_name,
|
685
|
+
handle_cluster_yaml=handle_cluster_yaml,
|
686
|
+
provision_record=provision_record,
|
687
|
+
custom_resource=custom_resource)
|
683
688
|
except Exception: # pylint: disable=broad-except
|
684
689
|
logger.error(
|
685
690
|
ux_utils.error_message(
|
sky/resources.py
CHANGED
@@ -98,7 +98,7 @@ class Resources:
|
|
98
98
|
"""
|
99
99
|
# If any fields changed, increment the version. For backward compatibility,
|
100
100
|
# modify the __setstate__ method to handle the old version.
|
101
|
-
_VERSION =
|
101
|
+
_VERSION = 26
|
102
102
|
|
103
103
|
def __init__(
|
104
104
|
self,
|
@@ -117,6 +117,7 @@ class Resources:
|
|
117
117
|
image_id: Union[Dict[Optional[str], str], str, None] = None,
|
118
118
|
disk_size: Optional[int] = None,
|
119
119
|
disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
|
120
|
+
network_tier: Optional[Union[str, resources_utils.NetworkTier]] = None,
|
120
121
|
ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
|
121
122
|
labels: Optional[Dict[str, str]] = None,
|
122
123
|
autostop: Union[bool, int, Dict[str, Any], None] = None,
|
@@ -202,6 +203,8 @@ class Resources:
|
|
202
203
|
disk_size: the size of the OS disk in GiB.
|
203
204
|
disk_tier: the disk performance tier to use. If None, defaults to
|
204
205
|
``'medium'``.
|
206
|
+
network_tier: the network performance tier to use. If None, defaults to
|
207
|
+
``'standard'``.
|
205
208
|
ports: the ports to open on the instance.
|
206
209
|
labels: the labels to apply to the instance. These are useful for
|
207
210
|
assigning metadata that may be used by external tools.
|
@@ -309,6 +312,20 @@ class Resources:
|
|
309
312
|
disk_tier = resources_utils.DiskTier(disk_tier_str)
|
310
313
|
self._disk_tier = disk_tier
|
311
314
|
|
315
|
+
if isinstance(network_tier, str):
|
316
|
+
network_tier_str = str(network_tier).lower()
|
317
|
+
supported_tiers = [
|
318
|
+
tier.value for tier in resources_utils.NetworkTier
|
319
|
+
]
|
320
|
+
if network_tier_str not in supported_tiers:
|
321
|
+
with ux_utils.print_exception_no_traceback():
|
322
|
+
raise ValueError(
|
323
|
+
f'Invalid network_tier {network_tier_str!r}. '
|
324
|
+
f'Network tier must be one of '
|
325
|
+
f'{", ".join(supported_tiers)}.')
|
326
|
+
network_tier = resources_utils.NetworkTier(network_tier_str)
|
327
|
+
self._network_tier = network_tier
|
328
|
+
|
312
329
|
if ports is not None:
|
313
330
|
if isinstance(ports, tuple):
|
314
331
|
ports = list(ports)
|
@@ -418,6 +435,10 @@ class Resources:
|
|
418
435
|
if self.disk_tier is not None:
|
419
436
|
disk_tier = f', disk_tier={self.disk_tier.value}'
|
420
437
|
|
438
|
+
network_tier = ''
|
439
|
+
if self.network_tier is not None:
|
440
|
+
network_tier = f', network_tier={self.network_tier.value}'
|
441
|
+
|
421
442
|
disk_size = ''
|
422
443
|
if self.disk_size != _DEFAULT_DISK_SIZE_GB:
|
423
444
|
disk_size = f', disk_size={self.disk_size}'
|
@@ -437,7 +458,7 @@ class Resources:
|
|
437
458
|
hardware_str = (
|
438
459
|
f'{instance_type}{use_spot}'
|
439
460
|
f'{cpus}{memory}{accelerators}{accelerator_args}{image_id}'
|
440
|
-
f'{disk_tier}{disk_size}{ports}')
|
461
|
+
f'{disk_tier}{network_tier}{disk_size}{ports}')
|
441
462
|
# It may have leading ',' (for example, instance_type not set) or empty
|
442
463
|
# spaces. Remove them.
|
443
464
|
while hardware_str and hardware_str[0] in (',', ' '):
|
@@ -567,6 +588,10 @@ class Resources:
|
|
567
588
|
def disk_tier(self) -> Optional[resources_utils.DiskTier]:
|
568
589
|
return self._disk_tier
|
569
590
|
|
591
|
+
@property
|
592
|
+
def network_tier(self) -> Optional[resources_utils.NetworkTier]:
|
593
|
+
return self._network_tier
|
594
|
+
|
570
595
|
@property
|
571
596
|
def ports(self) -> Optional[List[str]]:
|
572
597
|
return self._ports
|
@@ -1223,7 +1248,6 @@ class Resources:
|
|
1223
1248
|
|
1224
1249
|
def _try_validate_volumes(self) -> None:
|
1225
1250
|
"""Try to validate the volumes attribute.
|
1226
|
-
|
1227
1251
|
Raises:
|
1228
1252
|
ValueError: if the attribute is invalid.
|
1229
1253
|
"""
|
@@ -1532,6 +1556,12 @@ class Resources:
|
|
1532
1556
|
if not (self.disk_tier <= other.disk_tier): # pylint: disable=superfluous-parens
|
1533
1557
|
return False
|
1534
1558
|
|
1559
|
+
if self.network_tier is not None:
|
1560
|
+
if other.network_tier is None:
|
1561
|
+
return False
|
1562
|
+
if not self.network_tier <= other.network_tier:
|
1563
|
+
return False
|
1564
|
+
|
1535
1565
|
if check_ports:
|
1536
1566
|
if self.ports is not None:
|
1537
1567
|
if other.ports is None:
|
@@ -1586,6 +1616,7 @@ class Resources:
|
|
1586
1616
|
not self._use_spot_specified,
|
1587
1617
|
self._disk_size == _DEFAULT_DISK_SIZE_GB,
|
1588
1618
|
self._disk_tier is None,
|
1619
|
+
self._network_tier is None,
|
1589
1620
|
self._image_id is None,
|
1590
1621
|
self._ports is None,
|
1591
1622
|
self._docker_login_config is None,
|
@@ -1629,6 +1660,7 @@ class Resources:
|
|
1629
1660
|
zone=override.pop('zone', self.zone),
|
1630
1661
|
image_id=override.pop('image_id', self.image_id),
|
1631
1662
|
disk_tier=override.pop('disk_tier', self.disk_tier),
|
1663
|
+
network_tier=override.pop('network_tier', self.network_tier),
|
1632
1664
|
ports=override.pop('ports', self.ports),
|
1633
1665
|
labels=override.pop('labels', self.labels),
|
1634
1666
|
autostop=override.pop('autostop', current_autostop_config),
|
@@ -1667,6 +1699,9 @@ class Resources:
|
|
1667
1699
|
if (self.disk_tier is not None and
|
1668
1700
|
self.disk_tier != resources_utils.DiskTier.BEST):
|
1669
1701
|
features.add(clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
|
1702
|
+
if (self.network_tier is not None and
|
1703
|
+
self.network_tier == resources_utils.NetworkTier.BEST):
|
1704
|
+
features.add(clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER)
|
1670
1705
|
if self.extract_docker_image() is not None:
|
1671
1706
|
features.add(clouds.CloudImplementationFeatures.DOCKER_IMAGE)
|
1672
1707
|
elif self.image_id is not None:
|
@@ -1845,6 +1880,7 @@ class Resources:
|
|
1845
1880
|
resources_fields['disk_size'] = config.pop('disk_size', None)
|
1846
1881
|
resources_fields['image_id'] = config.pop('image_id', None)
|
1847
1882
|
resources_fields['disk_tier'] = config.pop('disk_tier', None)
|
1883
|
+
resources_fields['network_tier'] = config.pop('network_tier', None)
|
1848
1884
|
resources_fields['ports'] = config.pop('ports', None)
|
1849
1885
|
resources_fields['labels'] = config.pop('labels', None)
|
1850
1886
|
resources_fields['autostop'] = config.pop('autostop', None)
|
@@ -1897,6 +1933,8 @@ class Resources:
|
|
1897
1933
|
add_if_not_none('image_id', self.image_id)
|
1898
1934
|
if self.disk_tier is not None:
|
1899
1935
|
config['disk_tier'] = self.disk_tier.value
|
1936
|
+
if self.network_tier is not None:
|
1937
|
+
config['network_tier'] = self.network_tier.value
|
1900
1938
|
add_if_not_none('ports', self.ports)
|
1901
1939
|
add_if_not_none('labels', self.labels)
|
1902
1940
|
if self.volumes is not None:
|
@@ -2081,6 +2119,9 @@ class Resources:
|
|
2081
2119
|
if isinstance(state.get('_cloud', None), clouds.Kubernetes):
|
2082
2120
|
_maybe_add_docker_prefix_to_image_id(state['_image_id'])
|
2083
2121
|
|
2122
|
+
if version < 26:
|
2123
|
+
self._network_tier = state.get('_network_tier', None)
|
2124
|
+
|
2084
2125
|
self.__dict__.update(state)
|
2085
2126
|
|
2086
2127
|
|
sky/serve/controller.py
CHANGED
@@ -42,12 +42,13 @@ class SkyServeController:
|
|
42
42
|
"""
|
43
43
|
|
44
44
|
def __init__(self, service_name: str, service_spec: serve.SkyServiceSpec,
|
45
|
-
|
45
|
+
service_task_yaml: str, host: str, port: int) -> None:
|
46
46
|
self._service_name = service_name
|
47
47
|
self._replica_manager: replica_managers.ReplicaManager = (
|
48
|
-
replica_managers.SkyPilotReplicaManager(
|
49
|
-
|
50
|
-
|
48
|
+
replica_managers.SkyPilotReplicaManager(
|
49
|
+
service_name=service_name,
|
50
|
+
spec=service_spec,
|
51
|
+
service_task_yaml_path=service_task_yaml))
|
51
52
|
self._autoscaler: autoscalers.Autoscaler = (
|
52
53
|
autoscalers.Autoscaler.from_spec(service_name, service_spec))
|
53
54
|
self._host = host
|
@@ -240,7 +241,9 @@ class SkyServeController:
|
|
240
241
|
# TODO(tian): Probably we should support service that will stop the VM in
|
241
242
|
# specific time period.
|
242
243
|
def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
|
243
|
-
|
244
|
-
|
245
|
-
|
244
|
+
service_task_yaml: str, controller_host: str,
|
245
|
+
controller_port: int):
|
246
|
+
controller = SkyServeController(service_name, service_spec,
|
247
|
+
service_task_yaml, controller_host,
|
248
|
+
controller_port)
|
246
249
|
controller.run()
|
sky/serve/replica_managers.py
CHANGED
@@ -58,7 +58,7 @@ _MAX_NUM_LAUNCH = psutil.cpu_count() * 2
|
|
58
58
|
# TODO(tian): Combine this with
|
59
59
|
# sky/spot/recovery_strategy.py::StrategyExecutor::launch
|
60
60
|
def launch_cluster(replica_id: int,
|
61
|
-
|
61
|
+
service_task_yaml_path: str,
|
62
62
|
cluster_name: str,
|
63
63
|
resources_override: Optional[Dict[str, Any]] = None,
|
64
64
|
retry_until_up: bool = True,
|
@@ -78,7 +78,8 @@ def launch_cluster(replica_id: int,
|
|
78
78
|
f'{cluster_name} with resources override: '
|
79
79
|
f'{resources_override}')
|
80
80
|
try:
|
81
|
-
config = common_utils.read_yaml(
|
81
|
+
config = common_utils.read_yaml(
|
82
|
+
os.path.expanduser(service_task_yaml_path))
|
82
83
|
task = sky.Task.from_yaml_config(config)
|
83
84
|
if resources_override is not None:
|
84
85
|
resources = task.resources
|
@@ -173,9 +174,9 @@ def terminate_cluster(cluster_name: str,
|
|
173
174
|
time.sleep(gap_seconds)
|
174
175
|
|
175
176
|
|
176
|
-
def _get_resources_ports(
|
177
|
+
def _get_resources_ports(service_task_yaml_path: str) -> str:
|
177
178
|
"""Get the resources ports used by the task."""
|
178
|
-
task = sky.Task.from_yaml(
|
179
|
+
task = sky.Task.from_yaml(service_task_yaml_path)
|
179
180
|
# Already checked all ports are valid in sky.serve.core.up
|
180
181
|
assert task.resources, task
|
181
182
|
assert task.service is not None, task
|
@@ -183,7 +184,7 @@ def _get_resources_ports(task_yaml: str) -> str:
|
|
183
184
|
return task.service.ports
|
184
185
|
|
185
186
|
|
186
|
-
def _should_use_spot(
|
187
|
+
def _should_use_spot(service_task_yaml_path: str,
|
187
188
|
resource_override: Optional[Dict[str, Any]]) -> bool:
|
188
189
|
"""Get whether the task should use spot."""
|
189
190
|
if resource_override is not None:
|
@@ -191,7 +192,7 @@ def _should_use_spot(task_yaml: str,
|
|
191
192
|
if use_spot_override is not None:
|
192
193
|
assert isinstance(use_spot_override, bool)
|
193
194
|
return use_spot_override
|
194
|
-
task = sky.Task.from_yaml(
|
195
|
+
task = sky.Task.from_yaml(service_task_yaml_path)
|
195
196
|
spot_use_resources = [
|
196
197
|
resources for resources in task.resources if resources.use_spot
|
197
198
|
]
|
@@ -634,10 +635,10 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
634
635
|
"""
|
635
636
|
|
636
637
|
def __init__(self, service_name: str, spec: 'service_spec.SkyServiceSpec',
|
637
|
-
|
638
|
+
service_task_yaml_path: str) -> None:
|
638
639
|
super().__init__(service_name, spec)
|
639
|
-
self.
|
640
|
-
task = sky.Task.from_yaml(
|
640
|
+
self.service_task_yaml_path = service_task_yaml_path
|
641
|
+
task = sky.Task.from_yaml(service_task_yaml_path)
|
641
642
|
self._spot_placer: Optional[spot_placer.SpotPlacer] = (
|
642
643
|
spot_placer.SpotPlacer.from_task(spec, task))
|
643
644
|
# TODO(tian): Store launch/down pid in the replica table, to make the
|
@@ -714,7 +715,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
714
715
|
self._service_name, replica_id)
|
715
716
|
log_file_name = serve_utils.generate_replica_launch_log_file_name(
|
716
717
|
self._service_name, replica_id)
|
717
|
-
use_spot = _should_use_spot(self.
|
718
|
+
use_spot = _should_use_spot(self.service_task_yaml_path,
|
719
|
+
resources_override)
|
718
720
|
retry_until_up = True
|
719
721
|
location = None
|
720
722
|
if use_spot and self._spot_placer is not None:
|
@@ -742,10 +744,10 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
742
744
|
launch_cluster,
|
743
745
|
log_file_name,
|
744
746
|
).run,
|
745
|
-
args=(replica_id, self.
|
747
|
+
args=(replica_id, self.service_task_yaml_path, cluster_name,
|
746
748
|
resources_override, retry_until_up),
|
747
749
|
)
|
748
|
-
replica_port = _get_resources_ports(self.
|
750
|
+
replica_port = _get_resources_ports(self.service_task_yaml_path)
|
749
751
|
|
750
752
|
info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
|
751
753
|
location, self.latest_version, resources_override)
|
@@ -1290,11 +1292,11 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
1290
1292
|
logger.error(f'Invalid version: {version}, '
|
1291
1293
|
f'latest version: {self.latest_version}')
|
1292
1294
|
return
|
1293
|
-
|
1295
|
+
service_task_yaml_path = serve_utils.generate_task_yaml_file_name(
|
1294
1296
|
self._service_name, version)
|
1295
1297
|
serve_state.add_or_update_version(self._service_name, version, spec)
|
1296
1298
|
self.latest_version = version
|
1297
|
-
self.
|
1299
|
+
self.service_task_yaml_path = service_task_yaml_path
|
1298
1300
|
self._update_mode = update_mode
|
1299
1301
|
|
1300
1302
|
# Reuse all replicas that have the same config as the new version
|
@@ -1302,7 +1304,8 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
1302
1304
|
# the latest version. This can significantly improve the speed
|
1303
1305
|
# for updating an existing service with only config changes to the
|
1304
1306
|
# service specs, e.g. scale down the service.
|
1305
|
-
new_config = common_utils.read_yaml(
|
1307
|
+
new_config = common_utils.read_yaml(
|
1308
|
+
os.path.expanduser(service_task_yaml_path))
|
1306
1309
|
# Always create new replicas and scale down old ones when file_mounts
|
1307
1310
|
# are not empty.
|
1308
1311
|
if new_config.get('file_mounts', None) != {}:
|
@@ -1313,10 +1316,11 @@ class SkyPilotReplicaManager(ReplicaManager):
|
|
1313
1316
|
for info in replica_infos:
|
1314
1317
|
if info.version < version and not info.is_terminal:
|
1315
1318
|
# Assume user does not change the yaml file on the controller.
|
1316
|
-
|
1317
|
-
|
1319
|
+
old_service_task_yaml_path = (
|
1320
|
+
serve_utils.generate_task_yaml_file_name(
|
1321
|
+
self._service_name, info.version))
|
1318
1322
|
old_config = common_utils.read_yaml(
|
1319
|
-
os.path.expanduser(
|
1323
|
+
os.path.expanduser(old_service_task_yaml_path))
|
1320
1324
|
for key in ['service']:
|
1321
1325
|
old_config.pop(key)
|
1322
1326
|
# Bump replica version if all fields except for service are
|
sky/serve/service.py
CHANGED
@@ -186,7 +186,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
186
186
|
|
187
187
|
service_dir = os.path.expanduser(
|
188
188
|
serve_utils.generate_remote_service_dir_name(service_name))
|
189
|
-
|
189
|
+
service_task_yaml = serve_utils.generate_task_yaml_file_name(
|
190
|
+
service_name, version)
|
190
191
|
|
191
192
|
if not is_recovery:
|
192
193
|
if (len(serve_state.get_services()) >=
|
@@ -218,7 +219,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
218
219
|
# don't want the new file mounts to overwrite the old one, so we
|
219
220
|
# sync to a tmp file first and then copy it to the final name
|
220
221
|
# if there is no name conflict.
|
221
|
-
shutil.copy(tmp_task_yaml,
|
222
|
+
shutil.copy(tmp_task_yaml, service_task_yaml)
|
222
223
|
|
223
224
|
controller_process = None
|
224
225
|
load_balancer_process = None
|
@@ -249,8 +250,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
249
250
|
controller_host = _get_controller_host()
|
250
251
|
controller_process = multiprocessing.Process(
|
251
252
|
target=controller.run_controller,
|
252
|
-
args=(service_name, service_spec,
|
253
|
-
controller_port))
|
253
|
+
args=(service_name, service_spec, service_task_yaml,
|
254
|
+
controller_host, controller_port))
|
254
255
|
controller_process.start()
|
255
256
|
|
256
257
|
if not is_recovery:
|
sky/server/common.py
CHANGED
@@ -533,10 +533,13 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
|
|
533
533
|
api_server_status = None
|
534
534
|
try:
|
535
535
|
api_server_status = check_server_healthy()
|
536
|
+
if api_server_status == ApiServerStatus.NEEDS_AUTH:
|
537
|
+
endpoint = get_server_url()
|
538
|
+
with ux_utils.print_exception_no_traceback():
|
539
|
+
raise exceptions.ApiServerAuthenticationError(endpoint)
|
536
540
|
except exceptions.ApiServerConnectionError as exc:
|
537
541
|
endpoint = get_server_url()
|
538
|
-
if
|
539
|
-
api_server_status == ApiServerStatus.NEEDS_AUTH):
|
542
|
+
if not is_api_server_local():
|
540
543
|
with ux_utils.print_exception_no_traceback():
|
541
544
|
raise exceptions.ApiServerConnectionError(endpoint) from exc
|
542
545
|
# Lock to prevent multiple processes from starting the server at the
|
sky/server/constants.py
CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
|
|
7
7
|
# API server version, whenever there is a change in API server that requires a
|
8
8
|
# restart of the local API server or error out when the client does not match
|
9
9
|
# the server version.
|
10
|
-
API_VERSION = '
|
10
|
+
API_VERSION = '7'
|
11
11
|
|
12
12
|
# Prefix for API request names.
|
13
13
|
REQUEST_NAME_PREFIX = 'sky.'
|
sky/server/requests/payloads.py
CHANGED
sky/server/stream_utils.py
CHANGED
@@ -15,6 +15,8 @@ from sky.utils import rich_utils
|
|
15
15
|
|
16
16
|
logger = sky_logging.init_logger(__name__)
|
17
17
|
|
18
|
+
_HEARTBEAT_INTERVAL = 30
|
19
|
+
|
18
20
|
|
19
21
|
async def _yield_log_file_with_payloads_skipped(
|
20
22
|
log_file) -> AsyncGenerator[str, None]:
|
@@ -90,6 +92,8 @@ async def log_streamer(request_id: Optional[str],
|
|
90
92
|
for line_str in lines:
|
91
93
|
yield line_str
|
92
94
|
|
95
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
96
|
+
|
93
97
|
while True:
|
94
98
|
# Sleep 0 to yield control to allow other coroutines to run,
|
95
99
|
# while keeps the loop tight to make log stream responsive.
|
@@ -106,15 +110,32 @@ async def log_streamer(request_id: Optional[str],
|
|
106
110
|
break
|
107
111
|
if not follow:
|
108
112
|
break
|
113
|
+
|
114
|
+
current_time = asyncio.get_event_loop().time()
|
115
|
+
if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
|
116
|
+
# Currently just used to keep the connection busy, refer to
|
117
|
+
# https://github.com/skypilot-org/skypilot/issues/5750 for
|
118
|
+
# more details.
|
119
|
+
yield message_utils.encode_payload(
|
120
|
+
rich_utils.Control.HEARTBEAT.encode(''))
|
121
|
+
last_heartbeat_time = current_time
|
122
|
+
|
109
123
|
# Sleep shortly to avoid storming the DB and CPU, this has
|
110
124
|
# little impact on the responsivness here since we are waiting
|
111
125
|
# for a new line to come in.
|
112
126
|
await asyncio.sleep(0.1)
|
113
127
|
continue
|
128
|
+
|
129
|
+
# Refresh the heartbeat time, this is a trivial optimization for
|
130
|
+
# performance but it helps avoid unnecessary heartbeat strings
|
131
|
+
# being printed when the client runs in an old version.
|
132
|
+
last_heartbeat_time = asyncio.get_event_loop().time()
|
114
133
|
line_str = line.decode('utf-8')
|
115
134
|
if plain_logs:
|
116
135
|
is_payload, line_str = message_utils.decode_payload(
|
117
136
|
line_str, raise_for_mismatch=False)
|
137
|
+
# TODO(aylei): implement heartbeat mechanism for plain logs,
|
138
|
+
# sending invisible characters might be okay.
|
118
139
|
if is_payload:
|
119
140
|
continue
|
120
141
|
yield line_str
|
@@ -395,6 +395,13 @@ available_node_types:
|
|
395
395
|
# STEP 1: Run apt update, install missing packages, and set up ssh.
|
396
396
|
(
|
397
397
|
(
|
398
|
+
# For backwards compatibility, we put a marker file in the pod
|
399
|
+
# to indicate that the apt ssh setup step will write a completion
|
400
|
+
# marker file (/tmp/apt_ssh_setup_complete) to the pod.
|
401
|
+
# TODO: Remove this marker file and its usage in setup_commands
|
402
|
+
# after v0.11.0 release.
|
403
|
+
touch /tmp/apt_ssh_setup_started
|
404
|
+
|
398
405
|
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
|
399
406
|
echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
|
400
407
|
# Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
|
@@ -402,7 +409,7 @@ available_node_types:
|
|
402
409
|
PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
|
403
410
|
|
404
411
|
# Separate packages into two groups: packages that are installed first
|
405
|
-
# so that curl, rsync and wget are available sooner to unblock the following
|
412
|
+
# so that curl, rsync, ssh and wget are available sooner to unblock the following
|
406
413
|
# conda installation and rsync.
|
407
414
|
# Also, we install fuse first to avoid confliction with fuse3.
|
408
415
|
set -e
|
@@ -494,6 +501,8 @@ available_node_types:
|
|
494
501
|
$(prefix_cmd) service ssh restart;
|
495
502
|
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
496
503
|
|
504
|
+
touch /tmp/apt_ssh_setup_complete
|
505
|
+
echo "=== SSH setup completed ==="
|
497
506
|
) > /tmp/${STEPS[0]}.log 2>&1 || {
|
498
507
|
echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
|
499
508
|
cat /tmp/${STEPS[0]}.log
|
@@ -688,6 +697,13 @@ available_node_types:
|
|
688
697
|
{{k8s_resource_key}}: {{accelerator_count}}
|
689
698
|
{% endif %}
|
690
699
|
{% endif %}
|
700
|
+
{% if k8s_ipc_lock_capability %}
|
701
|
+
securityContext:
|
702
|
+
capabilities:
|
703
|
+
add:
|
704
|
+
- IPC_LOCK
|
705
|
+
{% endif %}
|
706
|
+
|
691
707
|
|
692
708
|
{% if high_availability %}
|
693
709
|
pvc_spec:
|
@@ -791,6 +807,15 @@ setup_commands:
|
|
791
807
|
{%- endfor %}
|
792
808
|
STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
|
793
809
|
start_epoch=$(date +%s);
|
810
|
+
|
811
|
+
# Wait for SSH setup to complete before proceeding
|
812
|
+
if [ -f /tmp/apt_ssh_setup_started ]; then
|
813
|
+
echo "=== Logs for asynchronous SSH setup ===";
|
814
|
+
[ -f /tmp/apt_ssh_setup_complete ] && cat /tmp/${STEPS[0]}.log ||
|
815
|
+
{ tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/apt_ssh_setup_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
816
|
+
[ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
|
817
|
+
fi
|
818
|
+
|
794
819
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
795
820
|
if [ -f /tmp/skypilot_is_nimbus ]; then
|
796
821
|
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
sky/utils/common_utils.py
CHANGED
@@ -324,9 +324,75 @@ def get_pretty_entrypoint_cmd() -> str:
|
|
324
324
|
# Turn '/.../anaconda/envs/py36/bin/sky' into 'sky', but keep other
|
325
325
|
# things like 'examples/app.py'.
|
326
326
|
argv[0] = basename
|
327
|
+
|
328
|
+
# Redact sensitive environment variable values
|
329
|
+
argv = _redact_env_values(argv)
|
330
|
+
|
327
331
|
return ' '.join(argv)
|
328
332
|
|
329
333
|
|
334
|
+
def _redact_env_values(argv: List[str]) -> List[str]:
|
335
|
+
"""Redact sensitive values from --env arguments.
|
336
|
+
|
337
|
+
Args:
|
338
|
+
argv: Command line arguments
|
339
|
+
|
340
|
+
Returns:
|
341
|
+
Modified argv with redacted --env values, or original argv if any error
|
342
|
+
|
343
|
+
Examples:
|
344
|
+
['sky', 'launch', '--env', 'HF_TOKEN=secret'] ->
|
345
|
+
['sky', 'launch', '--env', 'HF_TOKEN=<redacted>']
|
346
|
+
|
347
|
+
['sky', 'launch', '--env=HF_TOKEN=secret'] ->
|
348
|
+
['sky', 'launch', '--env=HF_TOKEN=<redacted>']
|
349
|
+
|
350
|
+
['sky', 'launch', '--env', 'HF_TOKEN'] ->
|
351
|
+
['sky', 'launch', '--env', 'HF_TOKEN'] (no change)
|
352
|
+
"""
|
353
|
+
try:
|
354
|
+
if not argv:
|
355
|
+
return argv or []
|
356
|
+
|
357
|
+
result = []
|
358
|
+
i = 0
|
359
|
+
|
360
|
+
while i < len(argv):
|
361
|
+
arg = argv[i]
|
362
|
+
|
363
|
+
# Ensure arg is a string
|
364
|
+
if not isinstance(arg, str):
|
365
|
+
result.append(arg)
|
366
|
+
i += 1
|
367
|
+
continue
|
368
|
+
|
369
|
+
if arg == '--env' and i + 1 < len(argv):
|
370
|
+
result.append(arg)
|
371
|
+
next_arg = argv[i + 1]
|
372
|
+
# Ensure next_arg is a string and handle redaction safely
|
373
|
+
if isinstance(next_arg, str):
|
374
|
+
redacted = re.sub(r'^([^=]+)=.*', r'\1=<redacted>',
|
375
|
+
next_arg)
|
376
|
+
result.append(redacted)
|
377
|
+
else:
|
378
|
+
result.append(next_arg)
|
379
|
+
i += 2
|
380
|
+
elif arg.startswith('--env='):
|
381
|
+
# Redact only if there's a value after the key
|
382
|
+
redacted = re.sub(r'^(--env=[^=]+)=.*', r'\1=<redacted>', arg)
|
383
|
+
result.append(redacted)
|
384
|
+
i += 1
|
385
|
+
else:
|
386
|
+
result.append(arg)
|
387
|
+
i += 1
|
388
|
+
|
389
|
+
return result
|
390
|
+
except Exception: # pylint: disable=broad-except
|
391
|
+
# If anything goes wrong with redaction, return original argv
|
392
|
+
# This ensures the command can still execute
|
393
|
+
return argv or []
|
394
|
+
|
395
|
+
|
330
396
|
def user_and_hostname_hash() -> str:
|
331
397
|
"""Returns a string containing <user>-<hostname hash last 4 chars>.
|
332
398
|
|
sky/utils/resources_utils.py
CHANGED
@@ -50,6 +50,32 @@ class DiskTier(enum.Enum):
|
|
50
50
|
return types.index(self) <= types.index(other)
|
51
51
|
|
52
52
|
|
53
|
+
class NetworkTier(enum.Enum):
    """All network tiers supported by SkyPilot."""
    STANDARD = 'standard'
    BEST = 'best'

    @classmethod
    def supported_tiers(cls) -> List[str]:
        """Return the valid tier value strings, in declaration order."""
        return [member.value for member in cls]

    @classmethod
    def cli_help_message(cls) -> str:
        """Build the help text shown for the network-tier CLI option."""
        choices = ', '.join(cls.supported_tiers())
        return (f'Network tier. Could be one of {choices}'
                f'. Default: {cls.STANDARD.value}')

    @classmethod
    def from_str(cls, tier: str) -> 'NetworkTier':
        """Parse a tier string; raise ValueError for unknown values."""
        if tier not in cls.supported_tiers():
            raise ValueError(f'Invalid network tier: {tier}')
        return cls(tier)

    def __le__(self, other: 'NetworkTier') -> bool:
        # Ordering follows declaration order: STANDARD <= BEST.
        members = list(NetworkTier)
        return members.index(self) <= members.index(other)
|
77
|
+
|
78
|
+
|
53
79
|
class StorageType(enum.Enum):
|
54
80
|
"""Storage type."""
|
55
81
|
# Durable network storage, e.g. GCP persistent disks
|