skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/provision/gcp/config.py
CHANGED
@@ -397,7 +397,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
397
397
|
operation = compute.networks().getEffectiveFirewalls(project=project_id,
|
398
398
|
network=vpc_name)
|
399
399
|
response = operation.execute()
|
400
|
-
if
|
400
|
+
if not response:
|
401
401
|
return False
|
402
402
|
effective_rules = response['firewalls']
|
403
403
|
|
@@ -515,7 +515,7 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
|
|
515
515
|
rule_list = _list_firewall_rules(project_id,
|
516
516
|
compute,
|
517
517
|
filter=f'(name={rule_name})')
|
518
|
-
if
|
518
|
+
if rule_list:
|
519
519
|
_delete_firewall_rule(project_id, compute, rule_name)
|
520
520
|
|
521
521
|
body = rule.copy()
|
@@ -624,7 +624,7 @@ def get_usable_vpc_and_subnet(
|
|
624
624
|
vpc_list = _list_vpcnets(project_id,
|
625
625
|
compute,
|
626
626
|
filter=f'name={constants.SKYPILOT_VPC_NAME}')
|
627
|
-
if
|
627
|
+
if not vpc_list:
|
628
628
|
body = constants.VPC_TEMPLATE.copy()
|
629
629
|
body['name'] = body['name'].format(VPC_NAME=constants.SKYPILOT_VPC_NAME)
|
630
630
|
body['selfLink'] = body['selfLink'].format(
|
@@ -670,9 +670,14 @@ def _configure_subnet(region: str, cluster_name: str,
|
|
670
670
|
'accessConfigs': [{
|
671
671
|
'name': 'External NAT',
|
672
672
|
'type': 'ONE_TO_ONE_NAT',
|
673
|
-
}]
|
673
|
+
}]
|
674
674
|
}]
|
675
|
-
if config
|
675
|
+
# Add gVNIC if specified in config
|
676
|
+
enable_gvnic = config.provider_config.get('enable_gvnic', False)
|
677
|
+
if enable_gvnic:
|
678
|
+
default_interfaces[0]['nicType'] = 'gVNIC'
|
679
|
+
enable_external_ips = _enable_external_ips(config)
|
680
|
+
if not enable_external_ips:
|
676
681
|
# Removing this key means the VM will not be assigned an external IP.
|
677
682
|
default_interfaces[0].pop('accessConfigs')
|
678
683
|
|
@@ -686,14 +691,19 @@ def _configure_subnet(region: str, cluster_name: str,
|
|
686
691
|
node_config['networkConfig'] = copy.deepcopy(default_interfaces)[0]
|
687
692
|
# TPU doesn't have accessConfigs
|
688
693
|
node_config['networkConfig'].pop('accessConfigs', None)
|
689
|
-
|
690
|
-
node_config['networkConfig']['enableExternalIps'] = False
|
691
|
-
else:
|
692
|
-
node_config['networkConfig']['enableExternalIps'] = True
|
694
|
+
node_config['networkConfig']['enableExternalIps'] = enable_external_ips
|
693
695
|
|
694
696
|
return config
|
695
697
|
|
696
698
|
|
699
|
+
def _enable_external_ips(config: common.ProvisionConfig) -> bool:
|
700
|
+
force_enable_external_ips = config.provider_config.get(
|
701
|
+
'force_enable_external_ips', False)
|
702
|
+
use_internal_ips = config.provider_config.get('use_internal_ips', False)
|
703
|
+
|
704
|
+
return force_enable_external_ips or not use_internal_ips
|
705
|
+
|
706
|
+
|
697
707
|
def _delete_firewall_rule(project_id: str, compute, name):
|
698
708
|
operation = (compute.firewalls().delete(project=project_id,
|
699
709
|
firewall=name).execute())
|
sky/provision/gcp/constants.py
CHANGED
@@ -142,7 +142,7 @@ FIREWALL_RULES_TEMPLATE = [
|
|
142
142
|
]
|
143
143
|
|
144
144
|
# A list of permissions required to run SkyPilot on GCP.
|
145
|
-
# Keep this in sync with https://skypilot.
|
145
|
+
# Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
|
146
146
|
VM_MINIMAL_PERMISSIONS = [
|
147
147
|
'compute.disks.create',
|
148
148
|
'compute.disks.list',
|
@@ -214,3 +214,9 @@ POLL_INTERVAL = 1
|
|
214
214
|
MAX_POLLS = 60 // POLL_INTERVAL
|
215
215
|
# Stopping instances can take several minutes, so we increase the timeout
|
216
216
|
MAX_POLLS_STOP = MAX_POLLS * 8
|
217
|
+
|
218
|
+
# MIG constants
|
219
|
+
MANAGED_INSTANCE_GROUP_CONFIG = 'managed-instance-group'
|
220
|
+
DEFAULT_MANAGED_INSTANCE_GROUP_PROVISION_TIMEOUT = 900 # 15 minutes
|
221
|
+
MIG_NAME_PREFIX = 'sky-mig-'
|
222
|
+
INSTANCE_TEMPLATE_NAME_PREFIX = 'sky-it-'
|
sky/provision/gcp/instance.py
CHANGED
@@ -7,20 +7,16 @@ import time
|
|
7
7
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Type
|
8
8
|
|
9
9
|
from sky import sky_logging
|
10
|
-
from sky import status_lib
|
11
10
|
from sky.adaptors import gcp
|
12
11
|
from sky.provision import common
|
12
|
+
from sky.provision import constants as provision_constants
|
13
13
|
from sky.provision.gcp import constants
|
14
14
|
from sky.provision.gcp import instance_utils
|
15
15
|
from sky.utils import common_utils
|
16
|
+
from sky.utils import status_lib
|
16
17
|
|
17
18
|
logger = sky_logging.init_logger(__name__)
|
18
19
|
|
19
|
-
TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node'
|
20
|
-
# Tag uniquely identifying all nodes of a cluster
|
21
|
-
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
22
|
-
TAG_RAY_NODE_KIND = 'ray-node-type'
|
23
|
-
|
24
20
|
_INSTANCE_RESOURCE_NOT_FOUND_PATTERN = re.compile(
|
25
21
|
r'The resource \'projects/.*/zones/.*/instances/.*\' was not found')
|
26
22
|
|
@@ -56,6 +52,8 @@ def _filter_instances(
|
|
56
52
|
# non_terminated_only=True?
|
57
53
|
# Will there be callers who would want this to be False?
|
58
54
|
# stop() and terminate() for example already implicitly assume non-terminated.
|
55
|
+
# Currently, even with non_terminated_only=False, we may not have a dict entry
|
56
|
+
# for terminated instances, if they have already been fully deleted.
|
59
57
|
@common_utils.retry
|
60
58
|
def query_instances(
|
61
59
|
cluster_name_on_cloud: str,
|
@@ -66,7 +64,9 @@ def query_instances(
|
|
66
64
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
67
65
|
zone = provider_config['availability_zone']
|
68
66
|
project_id = provider_config['project_id']
|
69
|
-
label_filters = {
|
67
|
+
label_filters = {
|
68
|
+
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
69
|
+
}
|
70
70
|
|
71
71
|
handler: Type[
|
72
72
|
instance_utils.GCPInstance] = instance_utils.GCPComputeInstance
|
@@ -124,15 +124,15 @@ def _wait_for_operations(
|
|
124
124
|
logger.debug(
|
125
125
|
f'wait_for_compute_{op_type}_operation: '
|
126
126
|
f'Waiting for operation {operation["name"]} to finish...')
|
127
|
-
handler.wait_for_operation(operation, project_id, zone)
|
127
|
+
handler.wait_for_operation(operation, project_id, zone=zone)
|
128
128
|
|
129
129
|
|
130
130
|
def _get_head_instance_id(instances: List) -> Optional[str]:
|
131
131
|
head_instance_id = None
|
132
132
|
for inst in instances:
|
133
133
|
labels = inst.get('labels', {})
|
134
|
-
if (labels.get(TAG_RAY_NODE_KIND) == 'head' or
|
135
|
-
labels.get(TAG_SKYPILOT_HEAD_NODE) == '1'):
|
134
|
+
if (labels.get(provision_constants.TAG_RAY_NODE_KIND) == 'head' or
|
135
|
+
labels.get(provision_constants.TAG_SKYPILOT_HEAD_NODE) == '1'):
|
136
136
|
head_instance_id = inst['name']
|
137
137
|
break
|
138
138
|
return head_instance_id
|
@@ -158,12 +158,16 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
158
158
|
resource: Type[instance_utils.GCPInstance]
|
159
159
|
if node_type == instance_utils.GCPNodeType.COMPUTE:
|
160
160
|
resource = instance_utils.GCPComputeInstance
|
161
|
+
elif node_type == instance_utils.GCPNodeType.MIG:
|
162
|
+
resource = instance_utils.GCPManagedInstanceGroup
|
161
163
|
elif node_type == instance_utils.GCPNodeType.TPU:
|
162
164
|
resource = instance_utils.GCPTPUVMInstance
|
163
165
|
else:
|
164
166
|
raise ValueError(f'Unknown node type {node_type}')
|
165
167
|
|
166
|
-
filter_labels = {
|
168
|
+
filter_labels = {
|
169
|
+
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
170
|
+
}
|
167
171
|
|
168
172
|
# wait until all stopping instances are stopped/terminated
|
169
173
|
while True:
|
@@ -264,12 +268,16 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
264
268
|
if config.resume_stopped_nodes and to_start_count > 0 and stopped_instances:
|
265
269
|
resumed_instance_ids = [n['name'] for n in stopped_instances]
|
266
270
|
if resumed_instance_ids:
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
271
|
+
resumed_instance_ids = resource.start_instances(
|
272
|
+
cluster_name_on_cloud, project_id, availability_zone,
|
273
|
+
resumed_instance_ids, labels)
|
274
|
+
# In MIG case, the resumed_instance_ids will include the previously
|
275
|
+
# PENDING and RUNNING instances. To avoid double counting, we need to
|
276
|
+
# remove them from the resumed_instance_ids.
|
277
|
+
ready_instances = set(resumed_instance_ids)
|
278
|
+
ready_instances |= set([n['name'] for n in running_instances])
|
279
|
+
ready_instances |= set([n['name'] for n in pending_instances])
|
280
|
+
to_start_count = config.count - len(ready_instances)
|
273
281
|
|
274
282
|
if head_instance_id is None:
|
275
283
|
head_instance_id = resource.create_node_tag(
|
@@ -281,9 +289,14 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
281
289
|
|
282
290
|
if to_start_count > 0:
|
283
291
|
errors, created_instance_ids = resource.create_instances(
|
284
|
-
cluster_name_on_cloud,
|
285
|
-
|
286
|
-
|
292
|
+
cluster_name_on_cloud,
|
293
|
+
project_id,
|
294
|
+
availability_zone,
|
295
|
+
config.node_config,
|
296
|
+
labels,
|
297
|
+
to_start_count,
|
298
|
+
total_count=config.count,
|
299
|
+
include_head_node=head_instance_id is None)
|
287
300
|
if errors:
|
288
301
|
error = common.ProvisionerError('Failed to launch instances.')
|
289
302
|
error.errors = errors
|
@@ -387,7 +400,9 @@ def get_cluster_info(
|
|
387
400
|
assert provider_config is not None, cluster_name_on_cloud
|
388
401
|
zone = provider_config['availability_zone']
|
389
402
|
project_id = provider_config['project_id']
|
390
|
-
label_filters = {
|
403
|
+
label_filters = {
|
404
|
+
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
405
|
+
}
|
391
406
|
|
392
407
|
handlers: List[Type[instance_utils.GCPInstance]] = [
|
393
408
|
instance_utils.GCPComputeInstance
|
@@ -415,7 +430,7 @@ def get_cluster_info(
|
|
415
430
|
project_id,
|
416
431
|
zone,
|
417
432
|
{
|
418
|
-
**label_filters, TAG_RAY_NODE_KIND: 'head'
|
433
|
+
**label_filters, provision_constants.TAG_RAY_NODE_KIND: 'head'
|
419
434
|
},
|
420
435
|
lambda h: [h.RUNNING_STATE],
|
421
436
|
)
|
@@ -441,14 +456,16 @@ def stop_instances(
|
|
441
456
|
assert provider_config is not None, cluster_name_on_cloud
|
442
457
|
zone = provider_config['availability_zone']
|
443
458
|
project_id = provider_config['project_id']
|
444
|
-
label_filters = {
|
459
|
+
label_filters = {
|
460
|
+
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
461
|
+
}
|
445
462
|
|
446
463
|
tpu_node = provider_config.get('tpu_node')
|
447
464
|
if tpu_node is not None:
|
448
465
|
instance_utils.delete_tpu_node(project_id, zone, tpu_node)
|
449
466
|
|
450
467
|
if worker_only:
|
451
|
-
label_filters[TAG_RAY_NODE_KIND] = 'worker'
|
468
|
+
label_filters[provision_constants.TAG_RAY_NODE_KIND] = 'worker'
|
452
469
|
|
453
470
|
handlers: List[Type[instance_utils.GCPInstance]] = [
|
454
471
|
instance_utils.GCPComputeInstance
|
@@ -510,9 +527,18 @@ def terminate_instances(
|
|
510
527
|
if tpu_node is not None:
|
511
528
|
instance_utils.delete_tpu_node(project_id, zone, tpu_node)
|
512
529
|
|
513
|
-
|
530
|
+
use_mig = provider_config.get('use_managed_instance_group', False)
|
531
|
+
if use_mig:
|
532
|
+
# Deleting the MIG will also delete the instances.
|
533
|
+
instance_utils.GCPManagedInstanceGroup.delete_mig(
|
534
|
+
project_id, zone, cluster_name_on_cloud)
|
535
|
+
return
|
536
|
+
|
537
|
+
label_filters = {
|
538
|
+
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
539
|
+
}
|
514
540
|
if worker_only:
|
515
|
-
label_filters[TAG_RAY_NODE_KIND] = 'worker'
|
541
|
+
label_filters[provision_constants.TAG_RAY_NODE_KIND] = 'worker'
|
516
542
|
|
517
543
|
handlers: List[Type[instance_utils.GCPInstance]] = [
|
518
544
|
instance_utils.GCPComputeInstance
|
@@ -555,7 +581,9 @@ def open_ports(
|
|
555
581
|
project_id = provider_config['project_id']
|
556
582
|
firewall_rule_name = provider_config['firewall_rule']
|
557
583
|
|
558
|
-
label_filters = {
|
584
|
+
label_filters = {
|
585
|
+
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
586
|
+
}
|
559
587
|
handlers: List[Type[instance_utils.GCPInstance]] = [
|
560
588
|
instance_utils.GCPComputeInstance,
|
561
589
|
instance_utils.GCPTPUVMInstance,
|
@@ -606,13 +634,6 @@ def cleanup_ports(
|
|
606
634
|
del ports # Unused.
|
607
635
|
assert provider_config is not None, cluster_name_on_cloud
|
608
636
|
project_id = provider_config['project_id']
|
609
|
-
if 'ports' in provider_config:
|
610
|
-
# Backward compatibility for old provider config.
|
611
|
-
# TODO(tian): remove this after 2 minor releases, 0.6.0.
|
612
|
-
for port in provider_config['ports']:
|
613
|
-
firewall_rule_name = f'user-ports-{cluster_name_on_cloud}-{port}'
|
614
|
-
instance_utils.GCPComputeInstance.delete_firewall_rule(
|
615
|
-
project_id, firewall_rule_name)
|
616
637
|
if 'firewall_rule' in provider_config:
|
617
638
|
firewall_rule_name = provider_config['firewall_rule']
|
618
639
|
instance_utils.GCPComputeInstance.delete_firewall_rule(
|