skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,11 @@
|
|
1
|
+
"""Lambda provisioner for SkyPilot."""
|
2
|
+
|
3
|
+
from sky.provision.lambda_cloud.config import bootstrap_instances
|
4
|
+
from sky.provision.lambda_cloud.instance import cleanup_ports
|
5
|
+
from sky.provision.lambda_cloud.instance import get_cluster_info
|
6
|
+
from sky.provision.lambda_cloud.instance import open_ports
|
7
|
+
from sky.provision.lambda_cloud.instance import query_instances
|
8
|
+
from sky.provision.lambda_cloud.instance import run_instances
|
9
|
+
from sky.provision.lambda_cloud.instance import stop_instances
|
10
|
+
from sky.provision.lambda_cloud.instance import terminate_instances
|
11
|
+
from sky.provision.lambda_cloud.instance import wait_instances
|
@@ -0,0 +1,10 @@
|
|
1
|
+
"""Lambda Cloud configuration bootstrapping"""
|
2
|
+
|
3
|
+
from sky.provision import common
|
4
|
+
|
5
|
+
|
6
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return the provision config unchanged.

    No bootstrapping work is performed here: `region` and `cluster_name`
    are discarded and `config` is handed back exactly as received.

    Args:
        region: Unused.
        cluster_name: Unused.
        config: Provision config to pass through.

    Returns:
        The same `config` object that was passed in.
    """
    del region, cluster_name  # unused
    return config
|
@@ -0,0 +1,265 @@
|
|
1
|
+
"""Lambda instance provisioning."""
|
2
|
+
|
3
|
+
import time
|
4
|
+
from typing import Any, Dict, List, Optional
|
5
|
+
|
6
|
+
from sky import sky_logging
|
7
|
+
from sky.provision import common
|
8
|
+
import sky.provision.lambda_cloud.lambda_utils as lambda_utils
|
9
|
+
from sky.utils import common_utils
|
10
|
+
from sky.utils import status_lib
|
11
|
+
from sky.utils import ux_utils
|
12
|
+
|
13
|
+
POLL_INTERVAL = 1
|
14
|
+
|
15
|
+
logger = sky_logging.init_logger(__name__)
|
16
|
+
_lambda_client = None
|
17
|
+
|
18
|
+
|
19
|
+
def _get_lambda_client():
    """Return the module-wide Lambda Cloud client, building it on first use.

    The client is cached in the module-level `_lambda_client` so that
    repeated calls reuse one `lambda_utils.LambdaCloudClient` instance.
    """
    global _lambda_client
    if _lambda_client is not None:
        # Already constructed by an earlier call.
        return _lambda_client
    _lambda_client = lambda_utils.LambdaCloudClient()
    return _lambda_client
|
24
|
+
|
25
|
+
|
26
|
+
def _filter_instances(
        cluster_name_on_cloud: str,
        status_filters: Optional[List[str]]) -> Dict[str, Dict[str, Any]]:
    """Collect the instances that belong to a cluster, keyed by instance id.

    An instance is considered part of the cluster when its name is exactly
    `<cluster_name_on_cloud>-head` or `<cluster_name_on_cloud>-worker`.

    Args:
        cluster_name_on_cloud: Cluster name used as the instance-name prefix.
        status_filters: Statuses to keep. None disables status filtering.

    Returns:
        Mapping from instance id to the instance record returned by the
        client's `list_instances()`.
    """
    all_records = _get_lambda_client().list_instances()
    cluster_node_names = (
        f'{cluster_name_on_cloud}-head',
        f'{cluster_name_on_cloud}-worker',
    )

    def _status_matches(record: Dict[str, Any]) -> bool:
        # The status key is only read when a filter was actually supplied.
        return status_filters is None or record['status'] in status_filters

    return {
        record['id']: record
        for record in all_records
        if _status_matches(record) and
        record.get('name') in cluster_node_names
    }
|
44
|
+
|
45
|
+
|
46
|
+
def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
47
|
+
head_instance_id = None
|
48
|
+
for instance_id, instance in instances.items():
|
49
|
+
if instance['name'].endswith('-head'):
|
50
|
+
head_instance_id = instance_id
|
51
|
+
break
|
52
|
+
return head_instance_id
|
53
|
+
|
54
|
+
|
55
|
+
def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
|
56
|
+
private_ip = instance_info.get('private_ip')
|
57
|
+
if private_ip is None:
|
58
|
+
if single_node:
|
59
|
+
# The Lambda cloud API may return an instance info without
|
60
|
+
# private IP. It does not align with their docs, but we still
|
61
|
+
# allow single-node cluster to proceed with provisioning, by using
|
62
|
+
# 127.0.0.1, as private IP is not critical for single-node case.
|
63
|
+
return '127.0.0.1'
|
64
|
+
msg = f'Failed to retrieve private IP for instance {instance_info}.'
|
65
|
+
logger.error(msg)
|
66
|
+
raise RuntimeError(msg)
|
67
|
+
return private_ip
|
68
|
+
|
69
|
+
|
70
|
+
def run_instances(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Runs instances for the given cluster.

    Waits for any 'booting' instances of the cluster to leave that state,
    then launches however many nodes are missing to reach `config.count`
    (the head first if there is none, then workers) and polls until exactly
    `config.count` instances are 'active'.

    Args:
        region: Region passed to the launch request and recorded in the
            returned record.
        cluster_name_on_cloud: Cluster name; instances are named
            `<cluster_name_on_cloud>-head` / `<cluster_name_on_cloud>-worker`.
        config: Provision config supplying `count`, `node_config` and
            `authentication_config`.

    Returns:
        A ProvisionRecord naming the head instance and the ids of the
        instances created by this call.

    Raises:
        RuntimeError: If the cluster already has more active nodes than
            `config.count`, or has exactly `config.count` active nodes but
            none of them is a head node.
    """
    lambda_client = _get_lambda_client()
    pending_status = ['booting']
    # Let in-flight boots settle first so the 'active' census below is not
    # taken while nodes are still coming up.
    while True:
        instances = _filter_instances(cluster_name_on_cloud, pending_status)
        if not instances:
            break
        logger.info(f'Waiting for {len(instances)} instances to be ready.')
        time.sleep(POLL_INTERVAL)
    exist_instances = _filter_instances(cluster_name_on_cloud, ['active'])
    head_instance_id = _get_head_instance_id(exist_instances)

    to_start_count = config.count - len(exist_instances)
    if to_start_count < 0:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has '
            f'{len(exist_instances)} nodes, but {config.count} are required.')
    if to_start_count == 0:
        if head_instance_id is None:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no head node.')
        logger.info(f'Cluster {cluster_name_on_cloud} already has '
                    f'{len(exist_instances)} nodes, no need to start more.')
        return common.ProvisionRecord(
            provider_name='lambda',
            cluster_name=cluster_name_on_cloud,
            region=region,
            zone=None,
            head_instance_id=head_instance_id,
            resumed_instance_ids=[],
            created_instance_ids=[],
        )

    created_instance_ids = []
    remote_ssh_key_name = config.authentication_config['remote_key_name']

    def launch_nodes(node_type: str, quantity: int) -> List[str]:
        """Launch `quantity` nodes named after `node_type`; return their ids."""
        try:
            instance_ids = lambda_client.create_instances(
                instance_type=config.node_config['InstanceType'],
                region=region,
                name=f'{cluster_name_on_cloud}-{node_type}',
                quantity=quantity,
                ssh_key_name=remote_ssh_key_name,
            )
            logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
                        f'instance_ids: {instance_ids}')
            return instance_ids
        except Exception as e:
            # Log for visibility, then let the caller see the original error.
            logger.warning(f'run_instances error: {e}')
            raise

    if head_instance_id is None:
        instance_ids = launch_nodes('head', 1)
        assert len(instance_ids) == 1
        created_instance_ids.append(instance_ids[0])
        head_instance_id = instance_ids[0]

    assert head_instance_id is not None, 'head_instance_id should not be None'

    # Deduct only the nodes launched above (the head, if it was missing).
    # When the head is already active, every missing node is a worker;
    # unconditionally subtracting one would launch one worker too few and
    # leave the wait loop below polling forever.
    worker_node_count = to_start_count - len(created_instance_ids)
    if worker_node_count > 0:
        instance_ids = launch_nodes('worker', worker_node_count)
        created_instance_ids.extend(instance_ids)

    # Block until the cluster reports exactly the requested number of
    # active nodes.
    while True:
        instances = _filter_instances(cluster_name_on_cloud, ['active'])
        if len(instances) == config.count:
            break

        time.sleep(POLL_INTERVAL)

    return common.ProvisionRecord(
        provider_name='lambda',
        cluster_name=cluster_name_on_cloud,
        region=region,
        zone=None,
        head_instance_id=head_instance_id,
        resumed_instance_ids=[],
        created_instance_ids=created_instance_ids,
    )
|
153
|
+
|
154
|
+
|
155
|
+
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Do nothing; every argument is discarded.

    NOTE(review): presumably no waiting is needed here because
    `run_instances` in this module already polls until the expected number
    of instances is 'active' -- confirm against the caller's expectations.
    """
    del region, cluster_name_on_cloud, state  # Unused.
|
158
|
+
|
159
|
+
|
160
|
+
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Always raise: stopping instances is not supported for Lambda Cloud.

    The arguments are accepted but never read.

    Raises:
        NotImplementedError: Unconditionally.
    """
    raise NotImplementedError(
        'stop_instances is not supported for Lambda Cloud')
|
167
|
+
|
168
|
+
|
169
|
+
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """See sky/provision/__init__.py

    Terminates the cluster's instances in any status, selected by name via
    `_filter_instances`.

    Args:
        cluster_name_on_cloud: Cluster whose instances are terminated.
        provider_config: Unused.
        worker_only: If True, only instances whose name ends with
            '-worker' are terminated, leaving the head in place.

    Raises:
        RuntimeError: If the terminate request fails; the original
            exception is chained as the cause.
    """
    del provider_config
    lambda_client = _get_lambda_client()
    instances = _filter_instances(cluster_name_on_cloud, None)

    instance_ids_to_terminate = [
        instance_id for instance_id, instance in instances.items()
        if not worker_only or instance['name'].endswith('-worker')
    ]

    if not instance_ids_to_terminate:
        # Nothing matched (e.g. the cluster is already gone): skip the
        # request instead of sending an empty id list to the API.
        logger.debug(
            f'No instances to terminate for cluster {cluster_name_on_cloud}.')
        return

    try:
        logger.debug(
            f'Terminating instances {", ".join(instance_ids_to_terminate)}')
        lambda_client.remove_instances(instance_ids_to_terminate)
    except Exception as e:  # pylint: disable=broad-except
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to terminate instances {instance_ids_to_terminate}: '
                f'{common_utils.format_exception(e, use_bracket=False)}') from e
|
194
|
+
|
195
|
+
|
196
|
+
def get_cluster_info(
    region: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
) -> common.ClusterInfo:
    """Describe the cluster's 'active' instances.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster whose active instances are reported.
        provider_config: Passed through unchanged into the result.

    Returns:
        A ClusterInfo holding one InstanceInfo per active instance (SSH
        port 22, empty tags) and the id of the instance whose name ends
        with '-head', or None if there is no such instance.
    """
    del region  # unused
    active_records = _filter_instances(cluster_name_on_cloud, ['active'])
    is_single_node = len(active_records) == 1

    head_id = None
    info_by_id: Dict[str, List[common.InstanceInfo]] = {}
    for node_id, record in active_records.items():
        node_info = common.InstanceInfo(
            instance_id=node_id,
            internal_ip=_get_private_ip(record, is_single_node),
            external_ip=record['ip'],
            ssh_port=22,
            tags={},
        )
        info_by_id[node_id] = [node_info]
        # No early exit: if several names end with '-head', the last wins.
        if record['name'].endswith('-head'):
            head_id = node_id

    return common.ClusterInfo(
        instances=info_by_id,
        head_instance_id=head_id,
        provider_name='lambda',
        provider_config=provider_config,
    )
|
225
|
+
|
226
|
+
|
227
|
+
def query_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
) -> Dict[str, Optional[status_lib.ClusterStatus]]:
    """See sky/provision/__init__.py

    Maps each of the cluster's instances to a ClusterStatus: 'booting' and
    'unhealthy' become INIT, 'active' becomes UP, and 'terminating' or any
    unlisted status becomes None.

    Args:
        cluster_name_on_cloud: Cluster whose instances are queried.
        provider_config: Must not be None; otherwise unused.
        non_terminated_only: If True, instances whose mapped status is None
            are left out of the result.

    Returns:
        Mapping from instance id to its mapped status.
    """
    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
    cluster_instances = _filter_instances(cluster_name_on_cloud, None)

    to_cluster_status = {
        'booting': status_lib.ClusterStatus.INIT,
        'active': status_lib.ClusterStatus.UP,
        'unhealthy': status_lib.ClusterStatus.INIT,
        'terminating': None,
    }
    resolved = ((node_id, to_cluster_status.get(record['status']))
                for node_id, record in cluster_instances.items())
    # A None status is kept only when the caller asked for every instance.
    return {
        node_id: node_status
        for node_id, node_status in resolved
        if node_status is not None or not non_terminated_only
    }
|
249
|
+
|
250
|
+
|
251
|
+
def open_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """Always raise: opening ports is not supported for Lambda Cloud.

    The arguments are accepted but never read.

    Raises:
        NotImplementedError: Unconditionally.
    """
    raise NotImplementedError('open_ports is not supported for Lambda Cloud')
|
257
|
+
|
258
|
+
|
259
|
+
def cleanup_ports(
    cluster_name_on_cloud: str,
    ports: List[str],
    provider_config: Optional[Dict[str, Any]] = None,
) -> None:
    """See sky/provision/__init__.py

    Does nothing; every argument is discarded.
    NOTE(review): presumably there is nothing to clean up because
    `open_ports` in this module never opens any port -- confirm.
    """
    del cluster_name_on_cloud, ports, provider_config  # Unused.
|
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Lambda Cloud helper functions."""
|
2
|
+
|
2
3
|
import json
|
3
4
|
import os
|
4
5
|
import time
|
@@ -49,7 +50,7 @@ class Metadata:
|
|
49
50
|
if value is None:
|
50
51
|
if instance_id in metadata:
|
51
52
|
metadata.pop(instance_id) # del entry
|
52
|
-
if
|
53
|
+
if not metadata:
|
53
54
|
if os.path.exists(self.path):
|
54
55
|
os.remove(self.path)
|
55
56
|
return
|
@@ -68,7 +69,7 @@ class Metadata:
|
|
68
69
|
for instance_id in list(metadata.keys()):
|
69
70
|
if instance_id not in instance_ids:
|
70
71
|
del metadata[instance_id]
|
71
|
-
if
|
72
|
+
if not metadata:
|
72
73
|
os.remove(self.path)
|
73
74
|
return
|
74
75
|
with open(self.path, 'w', encoding='utf-8') as f:
|
@@ -76,12 +77,12 @@ class Metadata:
|
|
76
77
|
|
77
78
|
|
78
79
|
def raise_lambda_error(response: requests.Response) -> None:
|
79
|
-
"""Raise LambdaCloudError if appropriate.
|
80
|
+
"""Raise LambdaCloudError if appropriate."""
|
80
81
|
status_code = response.status_code
|
81
82
|
if status_code == 200:
|
82
83
|
return
|
83
84
|
if status_code == 429:
|
84
|
-
# https://docs.lambdalabs.com/cloud/
|
85
|
+
# https://docs.lambdalabs.com/public-cloud/cloud-api/
|
85
86
|
raise LambdaCloudError('Your API requests are being rate limited.')
|
86
87
|
try:
|
87
88
|
resp_json = response.json()
|
@@ -131,23 +132,25 @@ class LambdaCloudClient:
|
|
131
132
|
self.api_key = self._credentials['api_key']
|
132
133
|
self.headers = {'Authorization': f'Bearer {self.api_key}'}
|
133
134
|
|
134
|
-
def create_instances(
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
135
|
+
def create_instances(
|
136
|
+
self,
|
137
|
+
instance_type: str = 'gpu_1x_a100_sxm4',
|
138
|
+
region: str = 'us-east-1',
|
139
|
+
quantity: int = 1,
|
140
|
+
name: str = '',
|
141
|
+
ssh_key_name: str = '',
|
142
|
+
) -> List[str]:
|
140
143
|
"""Launch new instances."""
|
141
144
|
# Optimization:
|
142
145
|
# Most API requests are rate limited at ~1 request every second but
|
143
146
|
# launch requests are rate limited at ~1 request every 10 seconds.
|
144
147
|
# So don't use launch requests to check availability.
|
145
|
-
# See https://docs.lambdalabs.com/cloud/
|
146
|
-
available_regions = self.list_catalog()[instance_type]
|
147
|
-
|
148
|
+
# See https://docs.lambdalabs.com/public-cloud/cloud-api/ for more.
|
149
|
+
available_regions = (self.list_catalog()[instance_type]
|
150
|
+
['regions_with_capacity_available'])
|
148
151
|
available_regions = [reg['name'] for reg in available_regions]
|
149
152
|
if region not in available_regions:
|
150
|
-
if
|
153
|
+
if available_regions:
|
151
154
|
aval_reg = ' '.join(available_regions)
|
152
155
|
else:
|
153
156
|
aval_reg = 'None'
|
@@ -163,27 +166,25 @@ class LambdaCloudClient:
|
|
163
166
|
'instance_type_name': instance_type,
|
164
167
|
'ssh_key_names': [ssh_key_name],
|
165
168
|
'quantity': quantity,
|
166
|
-
'name': name
|
169
|
+
'name': name,
|
167
170
|
})
|
168
171
|
response = _try_request_with_backoff(
|
169
172
|
'post',
|
170
173
|
f'{API_ENDPOINT}/instance-operations/launch',
|
171
174
|
data=data,
|
172
|
-
headers=self.headers
|
175
|
+
headers=self.headers,
|
176
|
+
)
|
173
177
|
return response.json().get('data', []).get('instance_ids', [])
|
174
178
|
|
175
|
-
def remove_instances(self,
|
179
|
+
def remove_instances(self, instance_ids: List[str]) -> Dict[str, Any]:
|
176
180
|
"""Terminate instances."""
|
177
|
-
data = json.dumps({
|
178
|
-
'instance_ids': [
|
179
|
-
instance_ids[0] # TODO(ewzeng) don't hardcode
|
180
|
-
]
|
181
|
-
})
|
181
|
+
data = json.dumps({'instance_ids': instance_ids})
|
182
182
|
response = _try_request_with_backoff(
|
183
183
|
'post',
|
184
184
|
f'{API_ENDPOINT}/instance-operations/terminate',
|
185
185
|
data=data,
|
186
|
-
headers=self.headers
|
186
|
+
headers=self.headers,
|
187
|
+
)
|
187
188
|
return response.json().get('data', []).get('terminated_instances', [])
|
188
189
|
|
189
190
|
def list_instances(self) -> List[Dict[str, Any]]:
|
sky/provision/logging.py
CHANGED
@@ -41,7 +41,7 @@ def setup_provision_logging(log_dir: str):
|
|
41
41
|
# Disable propagation to avoid streaming logs to the console, which is
|
42
42
|
# set up for sky root logger.
|
43
43
|
provision_logger.propagate = False
|
44
|
-
stream_handler =
|
44
|
+
stream_handler = logging.StreamHandler(sys.stdout)
|
45
45
|
stream_handler.flush = sys.stdout.flush # type: ignore
|
46
46
|
stream_handler.setFormatter(sky_logging.DIM_FORMATTER)
|
47
47
|
stream_handler.setLevel(logging.WARNING)
|
@@ -0,0 +1,11 @@
|
|
1
|
+
"""Nebius provisioner for SkyPilot."""
|
2
|
+
|
3
|
+
from sky.provision.nebius.config import bootstrap_instances
|
4
|
+
from sky.provision.nebius.instance import cleanup_ports
|
5
|
+
from sky.provision.nebius.instance import get_cluster_info
|
6
|
+
from sky.provision.nebius.instance import open_ports
|
7
|
+
from sky.provision.nebius.instance import query_instances
|
8
|
+
from sky.provision.nebius.instance import run_instances
|
9
|
+
from sky.provision.nebius.instance import stop_instances
|
10
|
+
from sky.provision.nebius.instance import terminate_instances
|
11
|
+
from sky.provision.nebius.instance import wait_instances
|
@@ -0,0 +1,11 @@
|
|
1
|
+
"""Nebius configuration bootstrapping."""
|
2
|
+
|
3
|
+
from sky.provision import common
|
4
|
+
|
5
|
+
|
6
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Bootstraps instances for the given cluster.

    No bootstrapping work is performed here: `region` and `cluster_name`
    are discarded and `config` is handed back exactly as received.

    Args:
        region: Unused.
        cluster_name: Unused.
        config: Provision config to pass through.

    Returns:
        The same `config` object that was passed in.
    """
    del region, cluster_name  # unused
    return config
|