skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/oci.py
CHANGED
@@ -4,21 +4,37 @@ History:
|
|
4
4
|
- Hysun He (hysun.he@oracle.com) @ Apr, 2023: Initial implementation
|
5
5
|
- Hysun He (hysun.he@oracle.com) @ May 4, 2023: Support using the default
|
6
6
|
image_id (configurable) if no image_id specified in the task yaml.
|
7
|
+
- Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
|
8
|
+
get_credential_file_mounts(): bug fix for sky config
|
9
|
+
file path resolution (by os.path.expanduser) when constructing the file
|
10
|
+
mounts. This bug will cause the created worker nodes located in different
|
11
|
+
compartment and VCN than the header node if user specifies compartment_id
|
12
|
+
in the sky config file, because the ~/.sky/config.yaml is not sync-ed to the
|
13
|
+
remote machine.
|
14
|
+
The workaround is to set the sky config file path using ENV before running
|
15
|
+
the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
|
16
|
+
- Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
|
17
|
+
make_deploy_resources_variables(): Bug fix for specifying the image_id as
|
18
|
+
the ocid of the image in the task.yaml file, in this case the image_id
|
19
|
+
for the node config should be set to the ocid instead of a dict.
|
20
|
+
- Hysun He (hysun.he@oracle.com) @ Oct 13, 2024:
|
21
|
+
Support more OS types additional to ubuntu for OCI resources.
|
7
22
|
"""
|
8
|
-
import json
|
9
23
|
import logging
|
10
24
|
import os
|
11
25
|
import typing
|
12
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
26
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
13
27
|
|
14
28
|
from sky import clouds
|
15
29
|
from sky import exceptions
|
16
|
-
from sky import status_lib
|
17
30
|
from sky.adaptors import oci as oci_adaptor
|
18
31
|
from sky.clouds import service_catalog
|
19
32
|
from sky.clouds.utils import oci_utils
|
33
|
+
from sky.provision.oci.query_utils import query_helper
|
20
34
|
from sky.utils import common_utils
|
35
|
+
from sky.utils import registry
|
21
36
|
from sky.utils import resources_utils
|
37
|
+
from sky.utils import status_lib
|
22
38
|
from sky.utils import ux_utils
|
23
39
|
|
24
40
|
if typing.TYPE_CHECKING:
|
@@ -30,7 +46,7 @@ logger = logging.getLogger(__name__)
|
|
30
46
|
_tenancy_prefix: Optional[str] = None
|
31
47
|
|
32
48
|
|
33
|
-
@
|
49
|
+
@registry.CLOUD_REGISTRY.register
|
34
50
|
class OCI(clouds.Cloud):
|
35
51
|
"""OCI: Oracle Cloud Infrastructure """
|
36
52
|
|
@@ -42,7 +58,12 @@ class OCI(clouds.Cloud):
|
|
42
58
|
|
43
59
|
_INDENT_PREFIX = ' '
|
44
60
|
|
45
|
-
_SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier)
|
61
|
+
_SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) -
|
62
|
+
{resources_utils.DiskTier.ULTRA})
|
63
|
+
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH
|
64
|
+
|
65
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
66
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
46
67
|
|
47
68
|
@classmethod
|
48
69
|
def _unsupported_features_for_resources(
|
@@ -55,8 +76,6 @@ class OCI(clouds.Cloud):
|
|
55
76
|
(f'Docker image is currently not supported on {cls._REPR}. '
|
56
77
|
'You can try running docker command inside the '
|
57
78
|
'`run` section in task.yaml.'),
|
58
|
-
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
59
|
-
(f'Opening ports is currently not supported on {cls._REPR}.'),
|
60
79
|
}
|
61
80
|
if resources.use_spot:
|
62
81
|
features[clouds.CloudImplementationFeatures.STOP] = (
|
@@ -176,7 +195,7 @@ class OCI(clouds.Cloud):
|
|
176
195
|
def get_accelerators_from_instance_type(
|
177
196
|
cls,
|
178
197
|
instance_type: str,
|
179
|
-
) -> Optional[Dict[str, int]]:
|
198
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
180
199
|
return service_catalog.get_accelerators_from_instance_type(
|
181
200
|
instance_type, clouds='oci')
|
182
201
|
|
@@ -187,19 +206,18 @@ class OCI(clouds.Cloud):
|
|
187
206
|
def make_deploy_resources_variables(
|
188
207
|
self,
|
189
208
|
resources: 'resources_lib.Resources',
|
190
|
-
|
209
|
+
cluster_name: resources_utils.ClusterName,
|
191
210
|
region: Optional['clouds.Region'],
|
192
211
|
zones: Optional[List['clouds.Zone']],
|
212
|
+
num_nodes: int,
|
193
213
|
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
194
|
-
del
|
214
|
+
del cluster_name, dryrun # Unused.
|
195
215
|
assert region is not None, resources
|
196
216
|
|
197
217
|
acc_dict = self.get_accelerators_from_instance_type(
|
198
218
|
resources.instance_type)
|
199
|
-
|
200
|
-
|
201
|
-
else:
|
202
|
-
custom_resources = None
|
219
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
220
|
+
acc_dict)
|
203
221
|
|
204
222
|
image_str = self._get_image_id(resources.image_id, region.name,
|
205
223
|
resources.instance_type)
|
@@ -209,10 +227,20 @@ class OCI(clouds.Cloud):
|
|
209
227
|
listing_id = image_cols[1]
|
210
228
|
res_ver = image_cols[2]
|
211
229
|
else:
|
212
|
-
|
230
|
+
# Oct.12,2024 by HysunHe: Bug fix - resources.image_id is a
|
231
|
+
# dict. The image_id here should be the ocid format.
|
232
|
+
image_id = image_str
|
213
233
|
listing_id = None
|
214
234
|
res_ver = None
|
215
235
|
|
236
|
+
os_type = None
|
237
|
+
if ':' in image_id:
|
238
|
+
# OS type provided in the --image-id. This is the case where
|
239
|
+
# custom image's ocid provided in the --image-id parameter.
|
240
|
+
# - ocid1.image...aaa:oraclelinux (os type is oraclelinux)
|
241
|
+
# - ocid1.image...aaa (OS not provided)
|
242
|
+
image_id, os_type = image_id.replace(' ', '').split(':')
|
243
|
+
|
216
244
|
cpus = resources.cpus
|
217
245
|
instance_type_arr = resources.instance_type.split(
|
218
246
|
oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR)
|
@@ -278,10 +306,24 @@ class OCI(clouds.Cloud):
|
|
278
306
|
cpus=None if cpus is None else float(cpus),
|
279
307
|
disk_tier=resources.disk_tier)
|
280
308
|
|
309
|
+
if os_type is None:
|
310
|
+
# OS type is not determined yet. So try to get it from vms.csv
|
311
|
+
image_str = self._get_image_str(
|
312
|
+
image_id=resources.image_id,
|
313
|
+
instance_type=resources.instance_type,
|
314
|
+
region=region.name)
|
315
|
+
|
316
|
+
# pylint: disable=import-outside-toplevel
|
317
|
+
from sky.clouds.service_catalog import oci_catalog
|
318
|
+
os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
|
319
|
+
region=region.name)
|
320
|
+
logger.debug(f'OS type for the image {image_id} is {os_type}')
|
321
|
+
|
281
322
|
return {
|
282
323
|
'instance_type': instance_type,
|
283
324
|
'custom_resources': custom_resources,
|
284
325
|
'region': region.name,
|
326
|
+
'os_type': os_type,
|
285
327
|
'cpus': str(cpus),
|
286
328
|
'memory': resources.memory,
|
287
329
|
'disk_size': resources.disk_size,
|
@@ -295,11 +337,13 @@ class OCI(clouds.Cloud):
|
|
295
337
|
|
296
338
|
def _get_feasible_launchable_resources(
|
297
339
|
self, resources: 'resources_lib.Resources'
|
298
|
-
) ->
|
340
|
+
) -> 'resources_utils.FeasibleResources':
|
299
341
|
if resources.instance_type is not None:
|
300
342
|
assert resources.is_launchable(), resources
|
301
343
|
resources = resources.copy(accelerators=None)
|
302
|
-
return
|
344
|
+
# TODO: Add hints to all return values in this method to help
|
345
|
+
# users understand why the resources are not launchable.
|
346
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
303
347
|
|
304
348
|
def _make(instance_list):
|
305
349
|
resource_list = []
|
@@ -326,9 +370,10 @@ class OCI(clouds.Cloud):
|
|
326
370
|
disk_tier=resources.disk_tier)
|
327
371
|
|
328
372
|
if default_instance_type is None:
|
329
|
-
return ([], [])
|
373
|
+
return resources_utils.FeasibleResources([], [], None)
|
330
374
|
else:
|
331
|
-
return (
|
375
|
+
return resources_utils.FeasibleResources(
|
376
|
+
_make([default_instance_type]), [], None)
|
332
377
|
|
333
378
|
assert len(accelerators) == 1, resources
|
334
379
|
|
@@ -344,9 +389,11 @@ class OCI(clouds.Cloud):
|
|
344
389
|
zone=resources.zone,
|
345
390
|
clouds='oci')
|
346
391
|
if instance_list is None:
|
347
|
-
return ([], fuzzy_candidate_list
|
392
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
393
|
+
None)
|
348
394
|
|
349
|
-
return (_make(instance_list),
|
395
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
396
|
+
fuzzy_candidate_list, None)
|
350
397
|
|
351
398
|
@classmethod
|
352
399
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -355,7 +402,7 @@ class OCI(clouds.Cloud):
|
|
355
402
|
short_credential_help_str = (
|
356
403
|
'For more details, refer to: '
|
357
404
|
# pylint: disable=line-too-long
|
358
|
-
'https://skypilot.
|
405
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci'
|
359
406
|
)
|
360
407
|
credential_help_str = (
|
361
408
|
'To configure credentials, go to: '
|
@@ -401,7 +448,7 @@ class OCI(clouds.Cloud):
|
|
401
448
|
return True, None
|
402
449
|
except (oci_adaptor.oci.exceptions.ConfigFileNotFound,
|
403
450
|
oci_adaptor.oci.exceptions.InvalidConfig,
|
404
|
-
oci_adaptor.
|
451
|
+
oci_adaptor.oci.exceptions.ServiceError) as e:
|
405
452
|
return False, (
|
406
453
|
f'OCI credential is not correctly set. '
|
407
454
|
f'Check the credential file at {conf_file}\n'
|
@@ -409,22 +456,42 @@ class OCI(clouds.Cloud):
|
|
409
456
|
f'{cls._INDENT_PREFIX}Error details: '
|
410
457
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
411
458
|
|
459
|
+
@classmethod
|
460
|
+
def check_disk_tier(
|
461
|
+
cls, instance_type: Optional[str],
|
462
|
+
disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
|
463
|
+
del instance_type # Unused.
|
464
|
+
if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST:
|
465
|
+
return True, ''
|
466
|
+
if disk_tier == resources_utils.DiskTier.ULTRA:
|
467
|
+
return False, ('OCI disk_tier=ultra is not supported now. '
|
468
|
+
'Please use disk_tier={low, medium, high, best} '
|
469
|
+
'instead.')
|
470
|
+
return True, ''
|
471
|
+
|
412
472
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
413
473
|
"""Returns a dict of credential file paths to mount paths."""
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
474
|
+
try:
|
475
|
+
oci_cfg_file = oci_adaptor.get_config_file()
|
476
|
+
# Pass-in a profile parameter so that multiple profile in oci
|
477
|
+
# config file is supported (2023/06/09).
|
478
|
+
oci_cfg = oci_adaptor.get_oci_config(
|
479
|
+
profile=oci_utils.oci_config.get_profile())
|
480
|
+
api_key_file = oci_cfg[
|
481
|
+
'key_file'] if 'key_file' in oci_cfg else 'BadConf'
|
482
|
+
sky_cfg_file = oci_utils.oci_config.get_sky_user_config_file()
|
483
|
+
# Must catch ImportError before any oci_adaptor.oci.exceptions
|
484
|
+
# because oci_adaptor.oci.exceptions can throw ImportError.
|
485
|
+
except ImportError:
|
486
|
+
return {}
|
487
|
+
except oci_adaptor.oci.exceptions.ConfigFileNotFound:
|
488
|
+
return {}
|
422
489
|
|
423
490
|
# OCI config and API key file are mandatory
|
424
491
|
credential_files = [oci_cfg_file, api_key_file]
|
425
492
|
|
426
493
|
# Sky config file is optional
|
427
|
-
if os.path.exists(sky_cfg_file):
|
494
|
+
if os.path.exists(os.path.expanduser(sky_cfg_file)):
|
428
495
|
credential_files.append(sky_cfg_file)
|
429
496
|
|
430
497
|
file_mounts = {
|
@@ -435,7 +502,7 @@ class OCI(clouds.Cloud):
|
|
435
502
|
return file_mounts
|
436
503
|
|
437
504
|
@classmethod
|
438
|
-
def
|
505
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
439
506
|
# NOTE: used for very advanced SkyPilot functionality
|
440
507
|
# Can implement later if desired
|
441
508
|
# If the user switches the compartment_ocid, the existing clusters
|
@@ -463,59 +530,45 @@ class OCI(clouds.Cloud):
|
|
463
530
|
region_name: str,
|
464
531
|
instance_type: str,
|
465
532
|
) -> str:
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
image_id_str = image_id[None]
|
471
|
-
else:
|
472
|
-
assert region_name in image_id, image_id
|
473
|
-
image_id_str = image_id[region_name]
|
533
|
+
image_id_str = self._get_image_str(image_id=image_id,
|
534
|
+
instance_type=instance_type,
|
535
|
+
region=region_name)
|
536
|
+
|
474
537
|
if image_id_str.startswith('skypilot:'):
|
475
538
|
image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
|
476
539
|
region_name,
|
477
540
|
clouds='oci')
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
# implementor, we need to find a better way to handle this.
|
485
|
-
raise exceptions.ResourcesUnavailableError(
|
486
|
-
'! ERR: No image found in catalog for region '
|
487
|
-
f'{region_name}. Try setting a valid image_id.')
|
541
|
+
|
542
|
+
# image_id should never be None, except for the case when
|
543
|
+
# the user specifies an image tag which does not exist in the image.csv
|
544
|
+
# catalog file, which is only possible in "test" / "evaluation" phase.
|
545
|
+
# Therefore, we use assert here.
|
546
|
+
assert image_id_str is not None
|
488
547
|
|
489
548
|
logger.debug(f'Got real image_id {image_id_str}')
|
490
549
|
return image_id_str
|
491
550
|
|
492
|
-
def
|
551
|
+
def _get_image_str(self, image_id: Optional[Dict[Optional[str], str]],
|
552
|
+
instance_type: str, region: str):
|
553
|
+
if image_id is None:
|
554
|
+
image_str = self._get_default_image_tag(instance_type)
|
555
|
+
elif None in image_id:
|
556
|
+
image_str = image_id[None]
|
557
|
+
else:
|
558
|
+
assert region in image_id, image_id
|
559
|
+
image_str = image_id[region]
|
560
|
+
return image_str
|
561
|
+
|
562
|
+
def _get_default_image_tag(self, instance_type: str) -> str:
|
493
563
|
acc = self.get_accelerators_from_instance_type(instance_type)
|
494
564
|
|
495
565
|
if acc is None:
|
496
566
|
image_tag = oci_utils.oci_config.get_default_image_tag()
|
497
|
-
image_id_str = service_catalog.get_image_id_from_tag(image_tag,
|
498
|
-
region_name,
|
499
|
-
clouds='oci')
|
500
567
|
else:
|
501
568
|
assert len(acc) == 1, acc
|
502
569
|
image_tag = oci_utils.oci_config.get_default_gpu_image_tag()
|
503
|
-
image_id_str = service_catalog.get_image_id_from_tag(image_tag,
|
504
|
-
region_name,
|
505
|
-
clouds='oci')
|
506
|
-
|
507
|
-
if image_id_str is not None:
|
508
|
-
logger.debug(
|
509
|
-
f'Got default image_id {image_id_str} from tag {image_tag}')
|
510
|
-
return image_id_str
|
511
570
|
|
512
|
-
|
513
|
-
# CloudVMRayBackend will be correctly triggered.
|
514
|
-
# TODO(zhwu): This is a information leakage to the cloud implementor,
|
515
|
-
# we need to find a better way to handle this.
|
516
|
-
raise exceptions.ResourcesUnavailableError(
|
517
|
-
'ERR: No image found in catalog for region '
|
518
|
-
f'{region_name}. Try update your default image_id settings.')
|
571
|
+
return image_tag
|
519
572
|
|
520
573
|
def get_vpu_from_disktier(
|
521
574
|
self, cpus: Optional[float],
|
@@ -559,25 +612,11 @@ class OCI(clouds.Cloud):
|
|
559
612
|
region: Optional[str], zone: Optional[str],
|
560
613
|
**kwargs) -> List[status_lib.ClusterStatus]:
|
561
614
|
del zone, kwargs # Unused.
|
562
|
-
# Check the lifecycleState definition from the page
|
563
|
-
# https://docs.oracle.com/en-us/iaas/api/#/en/iaas/latest/Instance/
|
564
|
-
status_map = {
|
565
|
-
'PROVISIONING': status_lib.ClusterStatus.INIT,
|
566
|
-
'STARTING': status_lib.ClusterStatus.INIT,
|
567
|
-
'RUNNING': status_lib.ClusterStatus.UP,
|
568
|
-
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
569
|
-
'STOPPED': status_lib.ClusterStatus.STOPPED,
|
570
|
-
'TERMINATED': None,
|
571
|
-
'TERMINATING': None,
|
572
|
-
}
|
573
|
-
|
574
|
-
# pylint: disable=import-outside-toplevel
|
575
|
-
from sky.skylet.providers.oci.query_helper import oci_query_helper
|
576
615
|
|
577
616
|
status_list = []
|
578
617
|
try:
|
579
|
-
vms =
|
580
|
-
|
618
|
+
vms = query_helper.query_instances_by_tags(tag_filters=tag_filters,
|
619
|
+
region=region)
|
581
620
|
except Exception as e: # pylint: disable=broad-except
|
582
621
|
with ux_utils.print_exception_no_traceback():
|
583
622
|
raise exceptions.ClusterStatusFetchingError(
|
@@ -587,9 +626,9 @@ class OCI(clouds.Cloud):
|
|
587
626
|
|
588
627
|
for node in vms:
|
589
628
|
vm_status = node.lifecycle_state
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
629
|
+
sky_status = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY.get(
|
630
|
+
vm_status, None)
|
631
|
+
if sky_status is not None:
|
632
|
+
status_list.append(sky_status)
|
594
633
|
|
595
634
|
return status_list
|
sky/clouds/paperspace.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
""" Paperspace Cloud. """
|
2
2
|
|
3
|
-
import json
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import requests
|
8
7
|
|
9
8
|
from sky import clouds
|
10
9
|
from sky.clouds import service_catalog
|
11
10
|
from sky.provision.paperspace import utils
|
11
|
+
from sky.utils import registry
|
12
12
|
from sky.utils import resources_utils
|
13
13
|
|
14
14
|
if typing.TYPE_CHECKING:
|
@@ -20,7 +20,7 @@ _CREDENTIAL_FILES = [
|
|
20
20
|
]
|
21
21
|
|
22
22
|
|
23
|
-
@
|
23
|
+
@registry.CLOUD_REGISTRY.register
|
24
24
|
class Paperspace(clouds.Cloud):
|
25
25
|
"""Paperspace GPU Cloud"""
|
26
26
|
|
@@ -162,7 +162,7 @@ class Paperspace(clouds.Cloud):
|
|
162
162
|
|
163
163
|
@classmethod
|
164
164
|
def get_accelerators_from_instance_type(
|
165
|
-
cls, instance_type: str) -> Optional[Dict[str, int]]:
|
165
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
166
166
|
return service_catalog.get_accelerators_from_instance_type(
|
167
167
|
instance_type, clouds='paperspace')
|
168
168
|
|
@@ -173,18 +173,17 @@ class Paperspace(clouds.Cloud):
|
|
173
173
|
def make_deploy_resources_variables(
|
174
174
|
self,
|
175
175
|
resources: 'resources_lib.Resources',
|
176
|
-
|
176
|
+
cluster_name: resources_utils.ClusterName,
|
177
177
|
region: 'clouds.Region',
|
178
178
|
zones: Optional[List['clouds.Zone']],
|
179
|
+
num_nodes: int,
|
179
180
|
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
180
|
-
del zones, dryrun
|
181
|
+
del zones, dryrun, cluster_name
|
181
182
|
|
182
183
|
r = resources
|
183
184
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
184
|
-
|
185
|
-
|
186
|
-
else:
|
187
|
-
custom_resources = None
|
185
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
186
|
+
acc_dict)
|
188
187
|
|
189
188
|
return {
|
190
189
|
'instance_type': resources.instance_type,
|
@@ -196,11 +195,13 @@ class Paperspace(clouds.Cloud):
|
|
196
195
|
self, resources: 'resources_lib.Resources'):
|
197
196
|
"""Returns a list of feasible resources for the given resources."""
|
198
197
|
if resources.use_spot:
|
199
|
-
return
|
198
|
+
# TODO: Add hints to all return values in this method to help
|
199
|
+
# users understand why the resources are not launchable.
|
200
|
+
return resources_utils.FeasibleResources([], [], None)
|
200
201
|
if resources.instance_type is not None:
|
201
202
|
assert resources.is_launchable(), resources
|
202
203
|
resources = resources.copy(accelerators=None)
|
203
|
-
return ([resources], [])
|
204
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
204
205
|
|
205
206
|
def _make(instance_list):
|
206
207
|
resource_list = []
|
@@ -223,9 +224,10 @@ class Paperspace(clouds.Cloud):
|
|
223
224
|
memory=resources.memory,
|
224
225
|
disk_tier=resources.disk_tier)
|
225
226
|
if default_instance_type is None:
|
226
|
-
return ([], [])
|
227
|
+
return resources_utils.FeasibleResources([], [], None)
|
227
228
|
else:
|
228
|
-
return (
|
229
|
+
return resources_utils.FeasibleResources(
|
230
|
+
_make([default_instance_type]), [], None)
|
229
231
|
|
230
232
|
assert len(accelerators) == 1, resources
|
231
233
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -241,8 +243,10 @@ class Paperspace(clouds.Cloud):
|
|
241
243
|
clouds='paperspace',
|
242
244
|
))
|
243
245
|
if instance_list is None:
|
244
|
-
return ([], fuzzy_candidate_list
|
245
|
-
|
246
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
247
|
+
None)
|
248
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
249
|
+
fuzzy_candidate_list, None)
|
246
250
|
|
247
251
|
@classmethod
|
248
252
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -255,7 +259,7 @@ class Paperspace(clouds.Cloud):
|
|
255
259
|
return False, (
|
256
260
|
'Failed to access Paperspace Cloud with credentials.\n '
|
257
261
|
'To configure credentials, follow the instructions at: '
|
258
|
-
'https://skypilot.
|
262
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html#paperspace\n '
|
259
263
|
'Generate API key and create a json at `~/.paperspace/config.json` with \n '
|
260
264
|
' {"apiKey": "[YOUR API KEY]"}\n '
|
261
265
|
f'Reason: {str(e)}')
|
@@ -275,7 +279,7 @@ class Paperspace(clouds.Cloud):
|
|
275
279
|
}
|
276
280
|
|
277
281
|
@classmethod
|
278
|
-
def
|
282
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
279
283
|
# NOTE: used for very advanced SkyPilot functionality
|
280
284
|
# Can implement later if desired
|
281
285
|
return None
|