skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,162 @@
|
|
1
|
+
# pylint: disable=assignment-from-no-return
|
2
|
+
#
|
3
|
+
# The pylint exception above is an accommodation for
|
4
|
+
# false positives generated by pylint for the Vast
|
5
|
+
# python sdk.
|
6
|
+
#
|
7
|
+
"""Vast library wrapper for SkyPilot."""
|
8
|
+
from typing import Any, Dict, List
|
9
|
+
|
10
|
+
from sky import sky_logging
|
11
|
+
from sky.adaptors import vast
|
12
|
+
|
13
|
+
# Module-level logger, created through sky_logging and keyed by module name.
logger = sky_logging.init_logger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def list_instances() -> Dict[str, Dict[str, Any]]:
    """Returns the instances reported by the Vast client, keyed by id.

    Each record returned by ``show_instances()`` is updated in place:

    * ``id`` is converted to ``str``.
    * ``status`` is the upper-cased ``actual_status`` when that value is a
      string, otherwise ``'UNKNOWN'``.
    * ``name`` is a copy of ``label``.

    Returns:
        A dict mapping the stringified instance id to its (mutated) record.
    """
    by_id: Dict[str, Dict[str, Any]] = {}

    for record in vast.vast().show_instances():
        record_id = str(record['id'])
        record['id'] = record_id

        # A non-string actual_status is reported as 'UNKNOWN'.
        raw_status = record['actual_status']
        record['status'] = (raw_status.upper()
                            if isinstance(raw_status, str) else 'UNKNOWN')
        record['name'] = record['label']

        by_id[record_id] = record

    return by_id
|
34
|
+
|
35
|
+
|
36
|
+
def launch(name: str, instance_type: str, region: str, disk_size: int,
           image_name: str, preemptible: bool) -> str:
    """Searches Vast for a matching offer and creates an instance from it.

    The instance type is split on '-' to recover the GPU count (first
    field), the GPU name (second field, underscores turned into spaces) and
    the CPU RAM (last field, divided by 1024). Those values, together with
    the region and disk size, form the offer search query; the first offer
    returned is the one that gets launched.

    Notes:

      * `georegion`: Feature flag that provides an additional scope of
        geographical specificity while maintaining backward
        compatibility.

      * `chunked`: Feature flag that gives breadth to the snowflake
        nature of the Vast catalog marketplace. It rounds down various
        machine specifications to emulate an instance type and make
        machines more interchangeable.

      * `disk_size`: The search matches offers whose disk space is the
        requested size or greater, e.g. `disk_size=100` might match an
        offer with 102 or even 1000. Because the match is not exact,
        extra disk cost is possible.

      * `geolocation`: Geolocation on Vast can be as specific as the host
        chooses, e.g. "Yutakachō, Shinagawa District, Tokyo, JP". A plain
        string comparison against "JP" would miss such a host. Since all
        geolocations end in a two-letter country code regardless of
        specificity, only the last two characters of `region` are used.

      * The catalog is cached, so availability of a machine cannot be
        guaranteed at the point of inquiry. The offer is therefore
        searched for again here and a failure is raised if nothing is
        available.

      * `cpu_ram` is included in the query as a guarantor that the
        matched instance satisfies the requested amount of memory.

      * Vast instance types are an invention for SkyPilot. Refer to
        service_catalog/vast_catalog.py for the current construction
        of the type.

    Args:
        name: Label assigned to the new instance.
        instance_type: SkyPilot-style Vast instance type string.
        region: Region string; only its last two characters are queried.
        disk_size: Minimum disk space to search for; also passed as the
            `disk` launch parameter.
        image_name: Image passed as the `image` launch parameter.
        preemptible: When true, the offer's `min_bid` is forwarded as a
            launch parameter.

    Returns:
        The `id` field of the newly created instance as reported by
        `show_instance`.

    Raises:
        RuntimeError: If the offer search returns an int or an empty list.
    """
    # Split once; fields are read in the same order as they are needed.
    type_fields = instance_type.split('-')
    cpu_ram = float(type_fields[-1]) / 1024
    gpu_name = type_fields[1].replace('_', ' ')
    num_gpus = int(type_fields[0].replace('x', ''))

    search_terms = [
        'chunked=true',
        'georegion=true',
        f'geolocation="{region[-2:]}"',
        f'disk_space>={disk_size}',
        f'num_gpus={num_gpus}',
        f'gpu_name="{gpu_name}"',
        f'cpu_ram>="{cpu_ram}"',
    ]
    query = ' '.join(search_terms)

    offers = vast.vast().search_offers(query=query)

    # An int result is treated the same as "no offers found".
    if isinstance(offers, int) or len(offers) == 0:
        raise RuntimeError('Failed to create instances, could not find an '
                           f'offer that satisfies the requirements "{query}".')

    chosen_offer = offers[0]

    launch_params = {
        'id': chosen_offer['id'],
        'direct': True,
        'ssh': True,
        'env': '-e __SOURCE=skypilot',
    }
    # Commands run on instance start, joined into a single ';' separated
    # string.
    launch_params['onstart_cmd'] = ';'.join([
        'touch ~/.no_auto_tmux',
        f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
    ])
    launch_params['label'] = name
    launch_params['image'] = image_name
    launch_params['disk'] = disk_size

    if preemptible:
        launch_params['min_bid'] = chosen_offer['min_bid']

    contract = vast.vast().create_instance(**launch_params)

    # Look the new instance up through the contract id that was returned.
    created = vast.vast().show_instance(id=contract['new_contract'])

    return created['id']
|
130
|
+
|
131
|
+
|
132
|
+
def start(instance_id: str) -> None:
    """Asks the Vast client to start the instance with the given id."""
    client = vast.vast()
    client.start_instance(id=instance_id)
|
135
|
+
|
136
|
+
|
137
|
+
def stop(instance_id: str) -> None:
    """Asks the Vast client to stop the instance with the given id."""
    client = vast.vast()
    client.stop_instance(id=instance_id)
|
140
|
+
|
141
|
+
|
142
|
+
def remove(instance_id: str) -> None:
    """Asks the Vast client to destroy the instance with the given id."""
    client = vast.vast()
    client.destroy_instance(id=instance_id)
|
145
|
+
|
146
|
+
|
147
|
+
def get_ssh_ports(cluster_name: str) -> List[int]:
    """Gets the SSH ports for the given cluster.

    An instance belongs to the cluster when its ``name`` is exactly
    ``'<cluster_name>-head'`` or ``'<cluster_name>-worker'``.

    Args:
        cluster_name: Cluster name used to build the expected instance names.

    Returns:
        The ``ssh_port`` value of every matching instance, in the order
        returned by ``list_instances()``.

    Raises:
        AssertionError: If no instance matches the cluster.
    """
    logger.debug(f'Getting SSH ports for cluster {cluster_name}.')

    possible_names = (f'{cluster_name}-head', f'{cluster_name}-worker')
    ssh_ports = [
        instance['ssh_port']
        for instance in list_instances().values()
        if instance['name'] in possible_names
    ]

    # Raised explicitly instead of using an `assert` statement so the check
    # still runs under `python -O`; the exception type and message are
    # unchanged for callers.
    if not ssh_ports:
        raise AssertionError(
            f'Could not find any instances for cluster {cluster_name}.')

    return ssh_ports
|
@@ -56,7 +56,7 @@ def get_hosts_by_cluster_names(content, vcenter_name, cluster_name_dicts=None):
|
|
56
56
|
'name': cluster.name
|
57
57
|
} for cluster in cluster_view.view]
|
58
58
|
cluster_view.Destroy()
|
59
|
-
if
|
59
|
+
if not cluster_name_dicts:
|
60
60
|
logger.warning(f'vCenter \'{vcenter_name}\' has no clusters')
|
61
61
|
|
62
62
|
# Retrieve all cluster names from the cluster_name_dicts
|
@@ -1,12 +1,9 @@
|
|
1
1
|
"""Vsphere instance provisioning."""
|
2
2
|
import json
|
3
|
-
import os
|
4
3
|
import typing
|
5
4
|
from typing import Any, Dict, List, Optional
|
6
5
|
|
7
|
-
from sky import exceptions
|
8
6
|
from sky import sky_logging
|
9
|
-
from sky import status_lib
|
10
7
|
from sky.adaptors import common as adaptors_common
|
11
8
|
from sky.adaptors import vsphere as vsphere_adaptor
|
12
9
|
from sky.clouds.service_catalog.common import get_catalog_path
|
@@ -18,6 +15,7 @@ from sky.provision.vsphere.common.vim_utils import poweroff_vm
|
|
18
15
|
from sky.provision.vsphere.common.vim_utils import wait_for_tasks
|
19
16
|
from sky.provision.vsphere.common.vim_utils import wait_internal_ip_ready
|
20
17
|
from sky.provision.vsphere.vsphere_utils import VsphereClient
|
18
|
+
from sky.utils import status_lib
|
21
19
|
|
22
20
|
if typing.TYPE_CHECKING:
|
23
21
|
import pandas as pd
|
@@ -30,7 +28,6 @@ TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
|
30
28
|
TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node'
|
31
29
|
HEAD_NODE_VALUE = '1'
|
32
30
|
WORKER_NODE_VALUE = '0'
|
33
|
-
PUBLIC_SSH_KEY_PATH = '~/.ssh/sky-key.pub'
|
34
31
|
|
35
32
|
|
36
33
|
def run_instances(region: str, cluster_name: str,
|
@@ -162,7 +159,7 @@ def _create_instances(
|
|
162
159
|
if not gpu_instance:
|
163
160
|
# Find an image for CPU
|
164
161
|
images_df = images_df[images_df['GpuTags'] == '\'[]\'']
|
165
|
-
if
|
162
|
+
if not images_df:
|
166
163
|
logger.error(
|
167
164
|
f'Can not find an image for instance type: {instance_type}.')
|
168
165
|
raise Exception(
|
@@ -185,7 +182,7 @@ def _create_instances(
|
|
185
182
|
image_instance_mapping_df = image_instance_mapping_df[
|
186
183
|
image_instance_mapping_df['InstanceType'] == instance_type]
|
187
184
|
|
188
|
-
if
|
185
|
+
if not image_instance_mapping_df:
|
189
186
|
raise Exception(f"""There is no image can match instance type named
|
190
187
|
{instance_type}
|
191
188
|
If you are using CPU-only instance, assign an image with tag
|
@@ -218,10 +215,9 @@ def _create_instances(
|
|
218
215
|
hosts_df = hosts_df[(hosts_df['AvailableCPUs'] /
|
219
216
|
hosts_df['cpuMhz']) >= cpus_needed]
|
220
217
|
hosts_df = hosts_df[hosts_df['AvailableMemory(MB)'] >= memory_needed]
|
221
|
-
assert
|
222
|
-
|
223
|
-
|
224
|
-
f'cpus and {memory_needed}MB memory are required.')
|
218
|
+
assert hosts_df, (f'There is no host available to create the instance '
|
219
|
+
f'{vms_item["InstanceType"]}, at least {cpus_needed} '
|
220
|
+
f'cpus and {memory_needed}MB memory are required.')
|
225
221
|
|
226
222
|
# Sort the hosts df by AvailableCPUs to get the compatible host with the
|
227
223
|
# least resource
|
@@ -303,13 +299,7 @@ def _create_instances(
|
|
303
299
|
|
304
300
|
# Create the customization spec
|
305
301
|
# Set up the VM's authorized_keys with customization spec
|
306
|
-
|
307
|
-
if not os.path.exists(ssh_key_path):
|
308
|
-
logger.error('SSH pubic key does not exist.')
|
309
|
-
raise exceptions.ResourcesUnavailableError(
|
310
|
-
'SSH pubic key does not exist.')
|
311
|
-
with open(ssh_key_path, 'r', encoding='utf-8') as f:
|
312
|
-
ssh_public_key = f.read()
|
302
|
+
ssh_public_key = config.authentication_config['ssh_public_key']
|
313
303
|
|
314
304
|
# Create a custom script to inject the ssh public key into the instance
|
315
305
|
vm_user = config.authentication_config['ssh_user']
|
@@ -365,7 +355,7 @@ def _choose_vsphere_cluster_name(config: common.ProvisionConfig, region: str,
|
|
365
355
|
skypilot framework-optimized availability_zones"""
|
366
356
|
vsphere_cluster_name = None
|
367
357
|
vsphere_cluster_name_str = config.provider_config['availability_zone']
|
368
|
-
if
|
358
|
+
if vc_object.clusters:
|
369
359
|
for optimized_cluster_name in vsphere_cluster_name_str.split(','):
|
370
360
|
if optimized_cluster_name in [
|
371
361
|
item['name'] for item in vc_object.clusters
|
@@ -257,7 +257,7 @@ class VsphereClient:
|
|
257
257
|
# hard code here. should support configure later.
|
258
258
|
profile_name = 'skypilot_policy'
|
259
259
|
storage_profile_id = None
|
260
|
-
if
|
260
|
+
if profile_ids:
|
261
261
|
profiles = pm.PbmRetrieveContent(profileIds=profile_ids)
|
262
262
|
for profile in profiles:
|
263
263
|
if profile_name in profile.name:
|