skypilot-nightly 1.0.0.dev20250530__py3-none-any.whl → 1.0.0.dev20250531__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +3 -2
- sky/cli.py +36 -10
- sky/client/cli.py +36 -10
- sky/clouds/cloud.py +20 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +10 -3
- sky/clouds/kubernetes.py +70 -4
- sky/clouds/lambda_cloud.py +3 -0
- sky/clouds/nebius.py +2 -0
- sky/clouds/paperspace.py +3 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +3 -0
- sky/clouds/vast.py +3 -0
- sky/clouds/vsphere.py +3 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bdeJWb62qu7L7FOq1dbXX/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-7458fda7b295f305.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-b638675d511d58b4.js +6 -0
- sky/dashboard/out/_next/static/chunks/{470-4d003c441839094d.js → 470-9e7a479cc8303baa.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{682-f3f1443ed2fba42f.js → 682-5c12535476a21ce3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/856-ab9627e7e8ac35e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f270e2c9c59fa1a.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-25edb867a41b6b20.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c0c1dff3cd463d9e.js +11 -0
- sky/dashboard/out/_next/static/css/2b3ee34e586949a3.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/client/sdk.py +3 -0
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +8 -3
- sky/jobs/utils.py +31 -10
- sky/provision/gcp/config.py +3 -1
- sky/provision/gcp/constants.py +10 -0
- sky/resources.py +44 -3
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +1 -0
- sky/templates/kubernetes-ray.yml.j2 +7 -0
- sky/utils/resources_utils.py +26 -0
- sky/utils/schemas.py +3 -0
- {skypilot_nightly-1.0.0.dev20250530.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250530.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/RECORD +62 -62
- sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +0 -6
- sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +0 -6
- sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +0 -11
- sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +0 -3
- /sky/dashboard/out/_next/static/{Q32Bxr2Pby5tFDW-y5TNg → bdeJWb62qu7L7FOq1dbXX}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-f19ea34b91c33950.js → _app-ad1edd7fe17ea796.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250530.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250530.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250530.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250530.dist-info → skypilot_nightly-1.0.0.dev20250531.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '687edf041d231b7f5003d7bc24c12e3eddd69fae'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250530'
|
38
|
+
__version__ = '1.0.0.dev20250531'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/backends/backend_utils.py
CHANGED
@@ -813,9 +813,9 @@ def write_cluster_config(
|
|
813
813
|
|
814
814
|
# Add kubernetes config fields from ~/.sky/config
|
815
815
|
if isinstance(cloud, clouds.Kubernetes):
|
816
|
+
cluster_config_overrides = to_provision.cluster_config_overrides
|
816
817
|
kubernetes_utils.combine_pod_config_fields(
|
817
|
-
tmp_yaml_path,
|
818
|
-
cluster_config_overrides=to_provision.cluster_config_overrides)
|
818
|
+
tmp_yaml_path, cluster_config_overrides=cluster_config_overrides)
|
819
819
|
kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
|
820
820
|
yaml_obj = common_utils.read_yaml(tmp_yaml_path)
|
821
821
|
pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
|
@@ -3950,11 +3950,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3950
3950
|
job_id: Optional[int] = None,
|
3951
3951
|
job_name: Optional[str] = None,
|
3952
3952
|
controller: bool = False,
|
3953
|
-
follow: bool = True
|
3953
|
+
follow: bool = True,
|
3954
|
+
tail: Optional[int] = None) -> int:
|
3954
3955
|
# if job_name is not None, job_id should be None
|
3955
3956
|
assert job_name is None or job_id is None, (job_name, job_id)
|
3956
3957
|
code = managed_jobs.ManagedJobCodeGen.stream_logs(
|
3957
|
-
job_name, job_id, follow, controller)
|
3958
|
+
job_name, job_id, follow, controller, tail)
|
3958
3959
|
|
3959
3960
|
# With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
|
3960
3961
|
# kill the process, so we need to handle it manually here.
|
sky/cli.py
CHANGED
@@ -415,6 +415,13 @@ _TASK_OPTIONS = [
|
|
415
415
|
case_sensitive=False),
|
416
416
|
required=False,
|
417
417
|
help=resources_utils.DiskTier.cli_help_message()),
|
418
|
+
click.option('--network-tier',
|
419
|
+
default=None,
|
420
|
+
type=click.Choice(
|
421
|
+
resources_utils.NetworkTier.supported_tiers(),
|
422
|
+
case_sensitive=False),
|
423
|
+
required=False,
|
424
|
+
help=resources_utils.NetworkTier.cli_help_message()),
|
418
425
|
click.option(
|
419
426
|
'--use-spot/--no-use-spot',
|
420
427
|
required=False,
|
@@ -696,6 +703,7 @@ def _parse_override_params(
|
|
696
703
|
image_id: Optional[str] = None,
|
697
704
|
disk_size: Optional[int] = None,
|
698
705
|
disk_tier: Optional[str] = None,
|
706
|
+
network_tier: Optional[str] = None,
|
699
707
|
ports: Optional[Tuple[str, ...]] = None,
|
700
708
|
config_override: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
701
709
|
"""Parses the override parameters into a dictionary."""
|
@@ -749,6 +757,11 @@ def _parse_override_params(
|
|
749
757
|
override_params['disk_tier'] = None
|
750
758
|
else:
|
751
759
|
override_params['disk_tier'] = disk_tier
|
760
|
+
if network_tier is not None:
|
761
|
+
if network_tier.lower() == 'none':
|
762
|
+
override_params['network_tier'] = None
|
763
|
+
else:
|
764
|
+
override_params['network_tier'] = network_tier
|
752
765
|
if ports:
|
753
766
|
if any(p.lower() == 'none' for p in ports):
|
754
767
|
if len(ports) > 1:
|
@@ -857,6 +870,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
857
870
|
image_id: Optional[str] = None,
|
858
871
|
disk_size: Optional[int] = None,
|
859
872
|
disk_tier: Optional[str] = None,
|
873
|
+
network_tier: Optional[str] = None,
|
860
874
|
ports: Optional[Tuple[str, ...]] = None,
|
861
875
|
env: Optional[List[Tuple[str, str]]] = None,
|
862
876
|
field_to_ignore: Optional[List[str]] = None,
|
@@ -897,6 +911,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
897
911
|
image_id=image_id,
|
898
912
|
disk_size=disk_size,
|
899
913
|
disk_tier=disk_tier,
|
914
|
+
network_tier=network_tier,
|
900
915
|
ports=ports,
|
901
916
|
config_override=config_override)
|
902
917
|
if field_to_ignore is not None:
|
@@ -1235,6 +1250,7 @@ def launch(
|
|
1235
1250
|
env: List[Tuple[str, str]],
|
1236
1251
|
disk_size: Optional[int],
|
1237
1252
|
disk_tier: Optional[str],
|
1253
|
+
network_tier: Optional[str],
|
1238
1254
|
ports: Tuple[str, ...],
|
1239
1255
|
idle_minutes_to_autostop: Optional[int],
|
1240
1256
|
down: bool, # pylint: disable=redefined-outer-name
|
@@ -1288,6 +1304,7 @@ def launch(
|
|
1288
1304
|
env=env,
|
1289
1305
|
disk_size=disk_size,
|
1290
1306
|
disk_tier=disk_tier,
|
1307
|
+
network_tier=network_tier,
|
1291
1308
|
ports=ports,
|
1292
1309
|
config_override=config_override,
|
1293
1310
|
)
|
@@ -1405,6 +1422,7 @@ def exec(cluster: Optional[str],
|
|
1405
1422
|
memory: Optional[str],
|
1406
1423
|
disk_size: Optional[int],
|
1407
1424
|
disk_tier: Optional[str],
|
1425
|
+
network_tier: Optional[str],
|
1408
1426
|
async_call: bool,
|
1409
1427
|
config_override: Optional[Dict[str, Any]] = None):
|
1410
1428
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
@@ -1500,6 +1518,7 @@ def exec(cluster: Optional[str],
|
|
1500
1518
|
env=env,
|
1501
1519
|
disk_size=disk_size,
|
1502
1520
|
disk_tier=disk_tier,
|
1521
|
+
network_tier=network_tier,
|
1503
1522
|
ports=ports,
|
1504
1523
|
field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'],
|
1505
1524
|
config_override=config_override,
|
@@ -4216,6 +4235,7 @@ def jobs_launch(
|
|
4216
4235
|
env: List[Tuple[str, str]],
|
4217
4236
|
disk_size: Optional[int],
|
4218
4237
|
disk_tier: Optional[str],
|
4238
|
+
network_tier: Optional[str],
|
4219
4239
|
ports: Tuple[str],
|
4220
4240
|
priority: Optional[int],
|
4221
4241
|
detach_run: bool,
|
@@ -4262,6 +4282,7 @@ def jobs_launch(
|
|
4262
4282
|
env=env,
|
4263
4283
|
disk_size=disk_size,
|
4264
4284
|
disk_tier=disk_tier,
|
4285
|
+
network_tier=network_tier,
|
4265
4286
|
ports=ports,
|
4266
4287
|
job_recovery=job_recovery,
|
4267
4288
|
priority=priority,
|
@@ -4599,6 +4620,7 @@ def _generate_task_with_service(
|
|
4599
4620
|
memory: Optional[str],
|
4600
4621
|
disk_size: Optional[int],
|
4601
4622
|
disk_tier: Optional[str],
|
4623
|
+
network_tier: Optional[str],
|
4602
4624
|
not_supported_cmd: str,
|
4603
4625
|
) -> sky.Task:
|
4604
4626
|
"""Generate a task with service section from a service YAML file."""
|
@@ -4625,6 +4647,7 @@ def _generate_task_with_service(
|
|
4625
4647
|
env=env,
|
4626
4648
|
disk_size=disk_size,
|
4627
4649
|
disk_tier=disk_tier,
|
4650
|
+
network_tier=network_tier,
|
4628
4651
|
ports=ports,
|
4629
4652
|
)
|
4630
4653
|
if isinstance(task, sky.Dag):
|
@@ -4738,6 +4761,7 @@ def serve_up(
|
|
4738
4761
|
memory: Optional[str],
|
4739
4762
|
disk_size: Optional[int],
|
4740
4763
|
disk_tier: Optional[str],
|
4764
|
+
network_tier: Optional[str],
|
4741
4765
|
yes: bool,
|
4742
4766
|
async_call: bool,
|
4743
4767
|
):
|
@@ -4792,6 +4816,7 @@ def serve_up(
|
|
4792
4816
|
env=env,
|
4793
4817
|
disk_size=disk_size,
|
4794
4818
|
disk_tier=disk_tier,
|
4819
|
+
network_tier=network_tier,
|
4795
4820
|
ports=ports,
|
4796
4821
|
not_supported_cmd='sky serve up',
|
4797
4822
|
)
|
@@ -4836,16 +4861,16 @@ def serve_up(
|
|
4836
4861
|
help='Skip confirmation prompt.')
|
4837
4862
|
@timeline.event
|
4838
4863
|
@usage_lib.entrypoint
|
4839
|
-
def serve_update(
|
4840
|
-
|
4841
|
-
|
4842
|
-
|
4843
|
-
|
4844
|
-
|
4845
|
-
|
4846
|
-
|
4847
|
-
|
4848
|
-
|
4864
|
+
def serve_update(
|
4865
|
+
service_name: str, service_yaml: Tuple[str, ...],
|
4866
|
+
workdir: Optional[str], infra: Optional[str], cloud: Optional[str],
|
4867
|
+
region: Optional[str], zone: Optional[str], num_nodes: Optional[int],
|
4868
|
+
use_spot: Optional[bool], image_id: Optional[str],
|
4869
|
+
env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
|
4870
|
+
gpus: Optional[str], instance_type: Optional[str], ports: Tuple[str],
|
4871
|
+
cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
|
4872
|
+
disk_tier: Optional[str], network_tier: Optional[str], mode: str,
|
4873
|
+
yes: bool, async_call: bool):
|
4849
4874
|
"""Update a SkyServe service.
|
4850
4875
|
|
4851
4876
|
service_yaml must point to a valid YAML file.
|
@@ -4895,6 +4920,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
|
|
4895
4920
|
env=env,
|
4896
4921
|
disk_size=disk_size,
|
4897
4922
|
disk_tier=disk_tier,
|
4923
|
+
network_tier=network_tier,
|
4898
4924
|
ports=ports,
|
4899
4925
|
not_supported_cmd='sky serve update',
|
4900
4926
|
)
|
sky/client/cli.py
CHANGED
@@ -415,6 +415,13 @@ _TASK_OPTIONS = [
|
|
415
415
|
case_sensitive=False),
|
416
416
|
required=False,
|
417
417
|
help=resources_utils.DiskTier.cli_help_message()),
|
418
|
+
click.option('--network-tier',
|
419
|
+
default=None,
|
420
|
+
type=click.Choice(
|
421
|
+
resources_utils.NetworkTier.supported_tiers(),
|
422
|
+
case_sensitive=False),
|
423
|
+
required=False,
|
424
|
+
help=resources_utils.NetworkTier.cli_help_message()),
|
418
425
|
click.option(
|
419
426
|
'--use-spot/--no-use-spot',
|
420
427
|
required=False,
|
@@ -696,6 +703,7 @@ def _parse_override_params(
|
|
696
703
|
image_id: Optional[str] = None,
|
697
704
|
disk_size: Optional[int] = None,
|
698
705
|
disk_tier: Optional[str] = None,
|
706
|
+
network_tier: Optional[str] = None,
|
699
707
|
ports: Optional[Tuple[str, ...]] = None,
|
700
708
|
config_override: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
701
709
|
"""Parses the override parameters into a dictionary."""
|
@@ -749,6 +757,11 @@ def _parse_override_params(
|
|
749
757
|
override_params['disk_tier'] = None
|
750
758
|
else:
|
751
759
|
override_params['disk_tier'] = disk_tier
|
760
|
+
if network_tier is not None:
|
761
|
+
if network_tier.lower() == 'none':
|
762
|
+
override_params['network_tier'] = None
|
763
|
+
else:
|
764
|
+
override_params['network_tier'] = network_tier
|
752
765
|
if ports:
|
753
766
|
if any(p.lower() == 'none' for p in ports):
|
754
767
|
if len(ports) > 1:
|
@@ -857,6 +870,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
857
870
|
image_id: Optional[str] = None,
|
858
871
|
disk_size: Optional[int] = None,
|
859
872
|
disk_tier: Optional[str] = None,
|
873
|
+
network_tier: Optional[str] = None,
|
860
874
|
ports: Optional[Tuple[str, ...]] = None,
|
861
875
|
env: Optional[List[Tuple[str, str]]] = None,
|
862
876
|
field_to_ignore: Optional[List[str]] = None,
|
@@ -897,6 +911,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
897
911
|
image_id=image_id,
|
898
912
|
disk_size=disk_size,
|
899
913
|
disk_tier=disk_tier,
|
914
|
+
network_tier=network_tier,
|
900
915
|
ports=ports,
|
901
916
|
config_override=config_override)
|
902
917
|
if field_to_ignore is not None:
|
@@ -1235,6 +1250,7 @@ def launch(
|
|
1235
1250
|
env: List[Tuple[str, str]],
|
1236
1251
|
disk_size: Optional[int],
|
1237
1252
|
disk_tier: Optional[str],
|
1253
|
+
network_tier: Optional[str],
|
1238
1254
|
ports: Tuple[str, ...],
|
1239
1255
|
idle_minutes_to_autostop: Optional[int],
|
1240
1256
|
down: bool, # pylint: disable=redefined-outer-name
|
@@ -1288,6 +1304,7 @@ def launch(
|
|
1288
1304
|
env=env,
|
1289
1305
|
disk_size=disk_size,
|
1290
1306
|
disk_tier=disk_tier,
|
1307
|
+
network_tier=network_tier,
|
1291
1308
|
ports=ports,
|
1292
1309
|
config_override=config_override,
|
1293
1310
|
)
|
@@ -1405,6 +1422,7 @@ def exec(cluster: Optional[str],
|
|
1405
1422
|
memory: Optional[str],
|
1406
1423
|
disk_size: Optional[int],
|
1407
1424
|
disk_tier: Optional[str],
|
1425
|
+
network_tier: Optional[str],
|
1408
1426
|
async_call: bool,
|
1409
1427
|
config_override: Optional[Dict[str, Any]] = None):
|
1410
1428
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
@@ -1500,6 +1518,7 @@ def exec(cluster: Optional[str],
|
|
1500
1518
|
env=env,
|
1501
1519
|
disk_size=disk_size,
|
1502
1520
|
disk_tier=disk_tier,
|
1521
|
+
network_tier=network_tier,
|
1503
1522
|
ports=ports,
|
1504
1523
|
field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'],
|
1505
1524
|
config_override=config_override,
|
@@ -4216,6 +4235,7 @@ def jobs_launch(
|
|
4216
4235
|
env: List[Tuple[str, str]],
|
4217
4236
|
disk_size: Optional[int],
|
4218
4237
|
disk_tier: Optional[str],
|
4238
|
+
network_tier: Optional[str],
|
4219
4239
|
ports: Tuple[str],
|
4220
4240
|
priority: Optional[int],
|
4221
4241
|
detach_run: bool,
|
@@ -4262,6 +4282,7 @@ def jobs_launch(
|
|
4262
4282
|
env=env,
|
4263
4283
|
disk_size=disk_size,
|
4264
4284
|
disk_tier=disk_tier,
|
4285
|
+
network_tier=network_tier,
|
4265
4286
|
ports=ports,
|
4266
4287
|
job_recovery=job_recovery,
|
4267
4288
|
priority=priority,
|
@@ -4599,6 +4620,7 @@ def _generate_task_with_service(
|
|
4599
4620
|
memory: Optional[str],
|
4600
4621
|
disk_size: Optional[int],
|
4601
4622
|
disk_tier: Optional[str],
|
4623
|
+
network_tier: Optional[str],
|
4602
4624
|
not_supported_cmd: str,
|
4603
4625
|
) -> sky.Task:
|
4604
4626
|
"""Generate a task with service section from a service YAML file."""
|
@@ -4625,6 +4647,7 @@ def _generate_task_with_service(
|
|
4625
4647
|
env=env,
|
4626
4648
|
disk_size=disk_size,
|
4627
4649
|
disk_tier=disk_tier,
|
4650
|
+
network_tier=network_tier,
|
4628
4651
|
ports=ports,
|
4629
4652
|
)
|
4630
4653
|
if isinstance(task, sky.Dag):
|
@@ -4738,6 +4761,7 @@ def serve_up(
|
|
4738
4761
|
memory: Optional[str],
|
4739
4762
|
disk_size: Optional[int],
|
4740
4763
|
disk_tier: Optional[str],
|
4764
|
+
network_tier: Optional[str],
|
4741
4765
|
yes: bool,
|
4742
4766
|
async_call: bool,
|
4743
4767
|
):
|
@@ -4792,6 +4816,7 @@ def serve_up(
|
|
4792
4816
|
env=env,
|
4793
4817
|
disk_size=disk_size,
|
4794
4818
|
disk_tier=disk_tier,
|
4819
|
+
network_tier=network_tier,
|
4795
4820
|
ports=ports,
|
4796
4821
|
not_supported_cmd='sky serve up',
|
4797
4822
|
)
|
@@ -4836,16 +4861,16 @@ def serve_up(
|
|
4836
4861
|
help='Skip confirmation prompt.')
|
4837
4862
|
@timeline.event
|
4838
4863
|
@usage_lib.entrypoint
|
4839
|
-
def serve_update(
|
4840
|
-
|
4841
|
-
|
4842
|
-
|
4843
|
-
|
4844
|
-
|
4845
|
-
|
4846
|
-
|
4847
|
-
|
4848
|
-
|
4864
|
+
def serve_update(
|
4865
|
+
service_name: str, service_yaml: Tuple[str, ...],
|
4866
|
+
workdir: Optional[str], infra: Optional[str], cloud: Optional[str],
|
4867
|
+
region: Optional[str], zone: Optional[str], num_nodes: Optional[int],
|
4868
|
+
use_spot: Optional[bool], image_id: Optional[str],
|
4869
|
+
env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
|
4870
|
+
gpus: Optional[str], instance_type: Optional[str], ports: Tuple[str],
|
4871
|
+
cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
|
4872
|
+
disk_tier: Optional[str], network_tier: Optional[str], mode: str,
|
4873
|
+
yes: bool, async_call: bool):
|
4849
4874
|
"""Update a SkyServe service.
|
4850
4875
|
|
4851
4876
|
service_yaml must point to a valid YAML file.
|
@@ -4895,6 +4920,7 @@ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
|
|
4895
4920
|
env=env,
|
4896
4921
|
disk_size=disk_size,
|
4897
4922
|
disk_tier=disk_tier,
|
4923
|
+
network_tier=network_tier,
|
4898
4924
|
ports=ports,
|
4899
4925
|
not_supported_cmd='sky serve update',
|
4900
4926
|
)
|
sky/clouds/cloud.py
CHANGED
@@ -45,6 +45,7 @@ class CloudImplementationFeatures(enum.Enum):
|
|
45
45
|
DOCKER_IMAGE = 'docker_image'
|
46
46
|
SPOT_INSTANCE = 'spot_instance'
|
47
47
|
CUSTOM_DISK_TIER = 'custom_disk_tier'
|
48
|
+
CUSTOM_NETWORK_TIER = 'custom_network_tier'
|
48
49
|
OPEN_PORTS = 'open_ports'
|
49
50
|
STORAGE_MOUNTING = 'storage_mounting'
|
50
51
|
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
|
@@ -139,6 +140,9 @@ class Cloud:
|
|
139
140
|
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
|
140
141
|
_BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
|
141
142
|
_SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
|
143
|
+
_SUPPORTED_NETWORK_TIERS = {
|
144
|
+
resources_utils.NetworkTier.STANDARD, resources_utils.NetworkTier.BEST
|
145
|
+
}
|
142
146
|
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False
|
143
147
|
|
144
148
|
# The version of provisioner and status query. This is used to determine
|
@@ -715,6 +719,22 @@ class Cloud:
|
|
715
719
|
raise exceptions.NotSupportedError(
|
716
720
|
f'{disk_tier} is not supported by {cls._REPR}.')
|
717
721
|
|
722
|
+
@classmethod
|
723
|
+
def check_network_tier_enabled(
|
724
|
+
cls, instance_type: Optional[str],
|
725
|
+
network_tier: resources_utils.NetworkTier) -> None:
|
726
|
+
"""Errors out if the network tier is not supported by the
|
727
|
+
cloud provider.
|
728
|
+
|
729
|
+
Raises:
|
730
|
+
exceptions.NotSupportedError: If the network tier is not supported.
|
731
|
+
"""
|
732
|
+
del instance_type # unused
|
733
|
+
if network_tier not in cls._SUPPORTED_NETWORK_TIERS:
|
734
|
+
with ux_utils.print_exception_no_traceback():
|
735
|
+
raise exceptions.NotSupportedError(
|
736
|
+
f'{network_tier} is not supported by {cls._REPR}.')
|
737
|
+
|
718
738
|
@classmethod
|
719
739
|
def _translate_disk_tier(
|
720
740
|
cls, disk_tier: Optional[resources_utils.DiskTier]
|
sky/clouds/cudo.py
CHANGED
@@ -59,6 +59,8 @@ class Cudo(clouds.Cloud):
|
|
59
59
|
('Spot is not supported, as Cudo API does not implement spot.'),
|
60
60
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
61
61
|
('Custom disk tier is currently not supported on Cudo Compute'),
|
62
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
63
|
+
('Custom network tier is currently not supported on Cudo Compute'),
|
62
64
|
clouds.CloudImplementationFeatures.IMAGE_ID:
|
63
65
|
('Image ID is currently not supported on Cudo. '),
|
64
66
|
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
sky/clouds/do.py
CHANGED
@@ -33,6 +33,9 @@ class DO(clouds.Cloud):
|
|
33
33
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
34
34
|
'Custom disk tiers'
|
35
35
|
f' is not supported in {_REPR}.',
|
36
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
37
|
+
('Custom network tier is currently not supported in '
|
38
|
+
f'{_REPR}.'),
|
36
39
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
37
40
|
('High availability controllers are not supported in '
|
38
41
|
f'{_REPR}.'),
|
sky/clouds/fluidstack.py
CHANGED
@@ -53,6 +53,9 @@ class Fluidstack(clouds.Cloud):
|
|
53
53
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
54
54
|
'Custom disk tiers'
|
55
55
|
f' is not supported in {_REPR}.',
|
56
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
57
|
+
('Custom network tier is currently not supported in '
|
58
|
+
f'{_REPR}.'),
|
56
59
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
|
57
60
|
'Host controllers'
|
58
61
|
f' are not supported in {_REPR}.',
|
sky/clouds/gcp.py
CHANGED
@@ -119,6 +119,11 @@ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
|
|
119
119
|
# Refer to https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a3-highgpu-8g/README.md#before-starting
|
120
120
|
_DEFAULT_GPU_DIRECT_IMAGE_ID = 'skypilot:gpu-direct-cos'
|
121
121
|
|
122
|
+
# From https://cloud.google.com/compute/docs/gpus/gpudirect
|
123
|
+
# A specific image is used to ensure that the GPU is configured with TCPX support.
|
124
|
+
_NETWORK_GCP_IMAGE_ID = ('docker:us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/'
|
125
|
+
'nccl-plugin-gpudirecttcpx')
|
126
|
+
|
122
127
|
|
123
128
|
def _run_output(cmd):
|
124
129
|
proc = subprocess.run(cmd,
|
@@ -505,6 +510,8 @@ class GCP(clouds.Cloud):
|
|
505
510
|
False,
|
506
511
|
override_configs=resources.cluster_config_overrides)
|
507
512
|
resources_vars['enable_gpu_direct'] = enable_gpu_direct
|
513
|
+
network_tier = r.network_tier
|
514
|
+
resources_vars['network_tier'] = network_tier
|
508
515
|
accelerators = r.accelerators
|
509
516
|
if accelerators is not None:
|
510
517
|
assert len(accelerators) == 1, r
|
@@ -539,8 +546,8 @@ class GCP(clouds.Cloud):
|
|
539
546
|
resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
|
540
547
|
acc.lower())
|
541
548
|
resources_vars['gpu_count'] = acc_count
|
542
|
-
if enable_gpu_direct:
|
543
|
-
image_id =
|
549
|
+
if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
|
550
|
+
image_id = _NETWORK_GCP_IMAGE_ID
|
544
551
|
else:
|
545
552
|
if acc == 'K80':
|
546
553
|
# Though the image is called cu113, it actually has later
|
@@ -630,7 +637,7 @@ class GCP(clouds.Cloud):
|
|
630
637
|
('gcp', 'placement_policy'),
|
631
638
|
None,
|
632
639
|
override_configs=resources.cluster_config_overrides)
|
633
|
-
if enable_gpu_direct:
|
640
|
+
if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
|
634
641
|
user_data += constants.GPU_DIRECT_TCPX_USER_DATA
|
635
642
|
docker_run_options += constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
|
636
643
|
if placement_policy is None:
|
sky/clouds/kubernetes.py
CHANGED
@@ -75,6 +75,9 @@ class Kubernetes(clouds.Cloud):
|
|
75
75
|
'tiers are not '
|
76
76
|
'supported in '
|
77
77
|
'Kubernetes.',
|
78
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
79
|
+
('Custom network tier is currently not supported in '
|
80
|
+
f'{_REPR}.'),
|
78
81
|
}
|
79
82
|
|
80
83
|
IMAGE_CPU = 'skypilot:custom-cpu-ubuntu-2004'
|
@@ -127,6 +130,12 @@ class Kubernetes(clouds.Cloud):
|
|
127
130
|
if spot_label_key is not None:
|
128
131
|
unsupported_features.pop(
|
129
132
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
|
133
|
+
# Allow custom network tier if supported by the cluster
|
134
|
+
# (e.g., Nebius clusters with high performance networking)
|
135
|
+
if cls._cluster_supports_high_performance_networking(context):
|
136
|
+
unsupported_features.pop(
|
137
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER,
|
138
|
+
None)
|
130
139
|
except exceptions.KubeAPIUnreachableError as e:
|
131
140
|
cls._log_unreachable_context(context, str(e))
|
132
141
|
return unsupported_features
|
@@ -471,6 +480,12 @@ class Kubernetes(clouds.Cloud):
|
|
471
480
|
ssh_jump_image = service_catalog.get_image_id_from_tag(
|
472
481
|
self.IMAGE_CPU, clouds='kubernetes')
|
473
482
|
|
483
|
+
# Set environment variables for the pod. Note that SkyPilot env vars
|
484
|
+
# are set separately when the task is run. These env vars are
|
485
|
+
# independent of the SkyPilot task to be run.
|
486
|
+
k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
|
487
|
+
|
488
|
+
# Setup GPU/TPU labels and resource keys.
|
474
489
|
k8s_acc_label_key = None
|
475
490
|
k8s_acc_label_values = None
|
476
491
|
k8s_topology_label_key = None
|
@@ -492,6 +507,17 @@ class Kubernetes(clouds.Cloud):
|
|
492
507
|
else:
|
493
508
|
k8s_resource_key = kubernetes_utils.get_gpu_resource_key()
|
494
509
|
else:
|
510
|
+
# If no GPUs are requested, we set NVIDIA_VISIBLE_DEVICES=none to
|
511
|
+
# maintain GPU isolation. This is to override the default behavior
|
512
|
+
# of Nvidia device plugin which would expose all GPUs to the pod
|
513
|
+
# when no GPUs are requested.
|
514
|
+
# Note that NVIDIA_VISIBLE_DEVICES is different from
|
515
|
+
# CUDA_VISIBLE_DEVICES - the latter is used to control which GPUs
|
516
|
+
# are visible to the application and is set inside the pod, while
|
517
|
+
# the former is used to control which GPUs are visible to the pod
|
518
|
+
# through the nvidia runtime.
|
519
|
+
# See: https://github.com/NVIDIA/k8s-device-plugin/issues/61
|
520
|
+
k8s_env_vars['NVIDIA_VISIBLE_DEVICES'] = 'none'
|
495
521
|
avoid_label_keys = kubernetes_utils.get_accelerator_label_keys(
|
496
522
|
context)
|
497
523
|
if len(avoid_label_keys) == 0:
|
@@ -552,10 +578,22 @@ class Kubernetes(clouds.Cloud):
|
|
552
578
|
timeout,
|
553
579
|
override_configs=resources.cluster_config_overrides)
|
554
580
|
|
555
|
-
#
|
556
|
-
#
|
557
|
-
|
558
|
-
|
581
|
+
# Check if this cluster supports high performance networking and
|
582
|
+
# configure IPC_LOCK capability for clusters like Nebius that support it
|
583
|
+
k8s_ipc_lock_capability = False
|
584
|
+
if (resources.network_tier is not None and
|
585
|
+
resources.network_tier == resources_utils.NetworkTier.BEST):
|
586
|
+
# Only proceed if CUSTOM_NETWORK_TIER is supported by this cluster
|
587
|
+
unsupported_features = self._unsupported_features_for_resources(
|
588
|
+
resources)
|
589
|
+
if clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER \
|
590
|
+
not in unsupported_features:
|
591
|
+
k8s_ipc_lock_capability = True
|
592
|
+
|
593
|
+
if k8s_ipc_lock_capability:
|
594
|
+
k8s_env_vars['NCCL_IB_HCA'] = 'mlx5'
|
595
|
+
k8s_env_vars['UCX_NET_DEVICES'] = \
|
596
|
+
'mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1' # pylint: disable=line-too-long
|
559
597
|
|
560
598
|
# We specify object-store-memory to be 500MB to avoid taking up too
|
561
599
|
# much memory on the head node. 'num-cpus' should be set to limit
|
@@ -621,6 +659,7 @@ class Kubernetes(clouds.Cloud):
|
|
621
659
|
'k8s_high_availability_storage_class_name':
|
622
660
|
(k8s_ha_storage_class_name),
|
623
661
|
'avoid_label_keys': avoid_label_keys,
|
662
|
+
'k8s_ipc_lock_capability': k8s_ipc_lock_capability,
|
624
663
|
}
|
625
664
|
|
626
665
|
# Add kubecontext if it is set. It may be None if SkyPilot is running
|
@@ -904,3 +943,30 @@ class Kubernetes(clouds.Cloud):
|
|
904
943
|
f'{cls.canonical_name()}/{c}'
|
905
944
|
for c in cls.existing_allowed_contexts(silent=True)
|
906
945
|
]
|
946
|
+
|
947
|
+
@classmethod
|
948
|
+
@annotations.lru_cache(scope='request', maxsize=10)
|
949
|
+
def _cluster_supports_high_performance_networking(cls,
|
950
|
+
context: str) -> bool:
|
951
|
+
"""Check if the cluster supports high performance networking.
|
952
|
+
|
953
|
+
Currently detects Nebius clusters by checking for nebius.com/ labels
|
954
|
+
on cluster nodes.
|
955
|
+
|
956
|
+
Args:
|
957
|
+
context: The Kubernetes context to check.
|
958
|
+
|
959
|
+
Returns:
|
960
|
+
True if the cluster supports high performance networking.
|
961
|
+
"""
|
962
|
+
try:
|
963
|
+
nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
|
964
|
+
for node in nodes:
|
965
|
+
if node.metadata.labels:
|
966
|
+
for label_key in node.metadata.labels.keys():
|
967
|
+
if label_key.startswith('nebius.com/'):
|
968
|
+
return True
|
969
|
+
except exceptions.KubeAPIUnreachableError:
|
970
|
+
# If we can't reach the cluster, assume no high perf networking
|
971
|
+
return False
|
972
|
+
return False
|
sky/clouds/lambda_cloud.py
CHANGED
@@ -43,6 +43,9 @@ class Lambda(clouds.Cloud):
|
|
43
43
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
|
44
44
|
clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
|
45
45
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
|
46
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
47
|
+
('Custom network tier is currently not supported in '
|
48
|
+
f'{_REPR}.'),
|
46
49
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
|
47
50
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: f'High availability controllers are not supported on {_REPR}.',
|
48
51
|
}
|
sky/clouds/nebius.py
CHANGED
@@ -56,6 +56,8 @@ class Nebius(clouds.Cloud):
|
|
56
56
|
(f'Migrating disk is currently not supported on {_REPR}.'),
|
57
57
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
58
58
|
(f'Custom disk tier is currently not supported on {_REPR}.'),
|
59
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
60
|
+
('Custom network tier is currently not supported on Nebius.'),
|
59
61
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
60
62
|
('High availability controllers are not supported on Nebius.'),
|
61
63
|
}
|
sky/clouds/paperspace.py
CHANGED
@@ -41,6 +41,9 @@ class Paperspace(clouds.Cloud):
|
|
41
41
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
42
42
|
'Custom disk tiers'
|
43
43
|
f' is not supported in {_REPR}.',
|
44
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
45
|
+
('Custom network tier is currently not supported in '
|
46
|
+
f'{_REPR}.'),
|
44
47
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
45
48
|
(f'High availability controllers are not supported in {_REPR}.'),
|
46
49
|
}
|
sky/clouds/runpod.py
CHANGED
@@ -30,6 +30,8 @@ class RunPod(clouds.Cloud):
|
|
30
30
|
'are non-trivial on RunPod.'),
|
31
31
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
32
32
|
('Customizing disk tier is not supported yet on RunPod.'),
|
33
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
34
|
+
('Custom network tier is not supported yet on RunPod.'),
|
33
35
|
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
|
34
36
|
('Mounting object stores is not supported on RunPod. To read data '
|
35
37
|
'from object stores on RunPod, use `mode: COPY` to copy the data '
|
sky/clouds/scp.py
CHANGED
@@ -56,6 +56,9 @@ class SCP(clouds.Cloud):
|
|
56
56
|
(f'Spot instances are not supported in {_REPR}.'),
|
57
57
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
58
58
|
(f'Custom disk tiers are not supported in {_REPR}.'),
|
59
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
60
|
+
('Custom network tier is currently not supported in '
|
61
|
+
f'{_REPR}.'),
|
59
62
|
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
60
63
|
(f'Opening ports is currently not supported on {_REPR}.'),
|
61
64
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
sky/clouds/vast.py
CHANGED
@@ -25,6 +25,9 @@ class Vast(clouds.Cloud):
|
|
25
25
|
'are non-trivial on Vast.'),
|
26
26
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
27
27
|
('Customizing disk tier is not supported yet on Vast.'),
|
28
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
29
|
+
('Custom network tier is currently not supported in '
|
30
|
+
f'{_REPR}.'),
|
28
31
|
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
29
32
|
('Opening ports is currently not supported on Vast.'),
|
30
33
|
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
|