skypilot-nightly 1.0.0.dev20250602__py3-none-any.whl → 1.0.0.dev20250604__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +3 -3
- sky/adaptors/kubernetes.py +8 -0
- sky/backends/backend_utils.py +1 -0
- sky/backends/cloud_vm_ray_backend.py +8 -4
- sky/{clouds/service_catalog → catalog}/__init__.py +6 -17
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +3 -3
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +2 -2
- sky/{clouds/service_catalog → catalog}/common.py +10 -8
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +2 -2
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +2 -2
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/ssh_catalog.py +3 -3
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +1 -1
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +1 -1
- sky/cli.py +7 -6
- sky/client/cli.py +7 -6
- sky/client/sdk.py +3 -4
- sky/clouds/aws.py +41 -40
- sky/clouds/azure.py +31 -34
- sky/clouds/cloud.py +8 -8
- sky/clouds/cudo.py +26 -26
- sky/clouds/do.py +24 -24
- sky/clouds/fluidstack.py +27 -29
- sky/clouds/gcp.py +45 -48
- sky/clouds/ibm.py +26 -26
- sky/clouds/kubernetes.py +24 -12
- sky/clouds/lambda_cloud.py +28 -30
- sky/clouds/nebius.py +26 -28
- sky/clouds/oci.py +32 -32
- sky/clouds/paperspace.py +24 -26
- sky/clouds/runpod.py +26 -28
- sky/clouds/scp.py +37 -36
- sky/clouds/utils/gcp_utils.py +3 -2
- sky/clouds/vast.py +27 -27
- sky/clouds/vsphere.py +12 -15
- sky/core.py +2 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/236-fef38aa6e5639300.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-947904ccc5687bac.js +6 -0
- sky/dashboard/out/_next/static/chunks/682-2be9b0f169727f2f.js +6 -0
- sky/dashboard/out/_next/static/chunks/856-f1b1f7f47edde2e8.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d7b6fb7f602bfcb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-158b70da336d8607.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-62c9982dc3675725.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-f37ff20f0af29aae.js → clusters-5549a350f97d7ef3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-35383adcb0edb5e2.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-342bc15bb78ab2e5.js → [context]-b68ddeed712d45b5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-7b4b8e7fa9fa0827.js → infra-13b117a831702196.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-a62a3c65dc9bc57c.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-78a6c5ba3e24c0cf.js → jobs-a76b2700eca236f7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-89f9212b81d8897e.js → users-07b523ccb19317ad.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-198b6e00d7d724c5.js → new-c7516f2b4c3727c0.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-2ce792183b03c341.js → [name]-7799de9e691e35d8.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f54921ec9eb20965.js +1 -0
- sky/dashboard/out/_next/static/css/63d3995d8b528eb1.css +3 -0
- sky/dashboard/out/_next/static/vWwfD3jOky5J5jULHp8JT/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +5 -2
- sky/execution.py +1 -2
- sky/global_user_state.py +2 -4
- sky/jobs/server/core.py +1 -1
- sky/jobs/utils.py +31 -1
- sky/optimizer.py +1 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/gcp/constants.py +4 -0
- sky/provision/kubernetes/utils.py +35 -22
- sky/provision/vast/utils.py +1 -1
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +1 -1
- sky/provision/vsphere/vsphere_utils.py +7 -11
- sky/resources.py +33 -2
- sky/serve/server/core.py +1 -1
- sky/server/common.py +86 -53
- sky/server/constants.py +1 -1
- sky/server/requests/executor.py +4 -1
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/serializers/decoders.py +1 -1
- sky/server/server.py +3 -3
- sky/skypilot_config.py +88 -37
- sky/usage/usage_lib.py +4 -3
- sky/utils/accelerator_registry.py +3 -3
- sky/utils/controller_utils.py +4 -14
- sky/utils/kubernetes/deploy_remote_cluster.py +2 -1
- sky/utils/schemas.py +6 -9
- {skypilot_nightly-1.0.0.dev20250602.dist-info → skypilot_nightly-1.0.0.dev20250604.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250602.dist-info → skypilot_nightly-1.0.0.dev20250604.dist-info}/RECORD +127 -126
- sky/dashboard/out/_next/static/chunks/236-7458fda7b295f305.js +0 -6
- sky/dashboard/out/_next/static/chunks/37-b638675d511d58b4.js +0 -6
- sky/dashboard/out/_next/static/chunks/682-5c12535476a21ce3.js +0 -6
- sky/dashboard/out/_next/static/chunks/856-ab9627e7e8ac35e8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f270e2c9c59fa1a.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-25edb867a41b6b20.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-3c6a2dabf56e8cd6.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c0c1dff3cd463d9e.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +0 -1
- sky/dashboard/out/_next/static/css/2b3ee34e586949a3.css +0 -3
- sky/dashboard/out/_next/static/dev-ndwjPgd_uQ4dcXXiv/_buildManifest.js +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{clouds/service_catalog → catalog}/constants.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-786c36624d5ff61f.js → 843-a097338acb89b7d7.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-ad1edd7fe17ea796.js → _app-67925f5e6382e22f.js} +0 -0
- /sky/dashboard/out/_next/static/{dev-ndwjPgd_uQ4dcXXiv → vWwfD3jOky5J5jULHp8JT}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250602.dist-info → skypilot_nightly-1.0.0.dev20250604.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250602.dist-info → skypilot_nightly-1.0.0.dev20250604.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250602.dist-info → skypilot_nightly-1.0.0.dev20250604.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250602.dist-info → skypilot_nightly-1.0.0.dev20250604.dist-info}/top_level.txt +0 -0
sky/clouds/fluidstack.py
CHANGED
@@ -3,9 +3,9 @@ import os
|
|
3
3
|
import typing
|
4
4
|
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
5
5
|
|
6
|
+
from sky import catalog
|
6
7
|
from sky import clouds
|
7
8
|
from sky.adaptors import common as adaptors_common
|
8
|
-
from sky.clouds import service_catalog
|
9
9
|
from sky.provision.fluidstack import fluidstack_utils
|
10
10
|
from sky.utils import registry
|
11
11
|
from sky.utils import resources_utils
|
@@ -96,7 +96,7 @@ class Fluidstack(clouds.Cloud):
|
|
96
96
|
del accelerators, zone # unused
|
97
97
|
if use_spot:
|
98
98
|
return []
|
99
|
-
regions =
|
99
|
+
regions = catalog.get_region_zones_for_instance_type(
|
100
100
|
instance_type, use_spot, 'fluidstack')
|
101
101
|
|
102
102
|
if region is not None:
|
@@ -128,11 +128,11 @@ class Fluidstack(clouds.Cloud):
|
|
128
128
|
use_spot: bool,
|
129
129
|
region: Optional[str] = None,
|
130
130
|
zone: Optional[str] = None) -> float:
|
131
|
-
return
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
131
|
+
return catalog.get_hourly_cost(instance_type,
|
132
|
+
use_spot=use_spot,
|
133
|
+
region=region,
|
134
|
+
zone=zone,
|
135
|
+
clouds='fluidstack')
|
136
136
|
|
137
137
|
def accelerators_to_hourly_cost(self,
|
138
138
|
accelerators: Dict[str, int],
|
@@ -155,26 +155,26 @@ class Fluidstack(clouds.Cloud):
|
|
155
155
|
cpus: Optional[str] = None,
|
156
156
|
memory: Optional[str] = None,
|
157
157
|
disk_tier: Optional[DiskTier] = None) -> Optional[str]:
|
158
|
-
return
|
159
|
-
|
160
|
-
|
161
|
-
|
158
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
159
|
+
memory=memory,
|
160
|
+
disk_tier=disk_tier,
|
161
|
+
clouds='fluidstack')
|
162
162
|
|
163
163
|
@classmethod
|
164
164
|
def get_accelerators_from_instance_type(
|
165
165
|
cls,
|
166
166
|
instance_type: str,
|
167
167
|
) -> Optional[Dict[str, Union[int, float]]]:
|
168
|
-
return
|
169
|
-
|
168
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
169
|
+
clouds='fluidstack')
|
170
170
|
|
171
171
|
@classmethod
|
172
172
|
def get_vcpus_mem_from_instance_type(
|
173
173
|
cls,
|
174
174
|
instance_type: str,
|
175
175
|
) -> Tuple[Optional[float], Optional[float]]:
|
176
|
-
return
|
177
|
-
|
176
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
177
|
+
clouds='fluidstack')
|
178
178
|
|
179
179
|
@classmethod
|
180
180
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
@@ -247,16 +247,16 @@ class Fluidstack(clouds.Cloud):
|
|
247
247
|
|
248
248
|
assert len(accelerators) == 1, resources
|
249
249
|
acc, acc_count = list(accelerators.items())[0]
|
250
|
-
(instance_list,
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
250
|
+
(instance_list,
|
251
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
252
|
+
acc,
|
253
|
+
acc_count,
|
254
|
+
use_spot=resources.use_spot,
|
255
|
+
cpus=resources.cpus,
|
256
|
+
memory=resources.memory,
|
257
|
+
region=resources.region,
|
258
|
+
zone=resources.zone,
|
259
|
+
clouds='fluidstack')
|
260
260
|
if instance_list is None:
|
261
261
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
262
262
|
None)
|
@@ -306,12 +306,10 @@ class Fluidstack(clouds.Cloud):
|
|
306
306
|
return None
|
307
307
|
|
308
308
|
def instance_type_exists(self, instance_type: str) -> bool:
|
309
|
-
return
|
309
|
+
return catalog.instance_type_exists(instance_type, 'fluidstack')
|
310
310
|
|
311
311
|
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
312
|
-
return
|
313
|
-
zone,
|
314
|
-
clouds='fluidstack')
|
312
|
+
return catalog.validate_region_zone(region, zone, clouds='fluidstack')
|
315
313
|
|
316
314
|
@classmethod
|
317
315
|
def query_status(
|
sky/clouds/gcp.py
CHANGED
@@ -10,12 +10,12 @@ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
|
10
10
|
|
11
11
|
import colorama
|
12
12
|
|
13
|
+
from sky import catalog
|
13
14
|
from sky import clouds
|
14
15
|
from sky import exceptions
|
15
16
|
from sky import sky_logging
|
16
17
|
from sky import skypilot_config
|
17
18
|
from sky.adaptors import gcp
|
18
|
-
from sky.clouds import service_catalog
|
19
19
|
from sky.clouds.utils import gcp_utils
|
20
20
|
from sky.provision.gcp import constants
|
21
21
|
from sky.provision.gcp import volume_utils
|
@@ -111,7 +111,7 @@ _IMAGE_NOT_FOUND_UX_MESSAGE = (
|
|
111
111
|
|
112
112
|
# Image ID tags
|
113
113
|
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
|
114
|
-
# For GPU-related package version, see sky/clouds/
|
114
|
+
# For GPU-related package version, see sky/clouds/catalog/images/provisioners/cuda.sh
|
115
115
|
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
|
116
116
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
|
117
117
|
# Use COS image with GPU Direct support.
|
@@ -119,11 +119,6 @@ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
|
|
119
119
|
# Refer to https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a3-highgpu-8g/README.md#before-starting
|
120
120
|
_DEFAULT_GPU_DIRECT_IMAGE_ID = 'skypilot:gpu-direct-cos'
|
121
121
|
|
122
|
-
# From https://cloud.google.com/compute/docs/gpus/gpudirect
|
123
|
-
# A specific image is used to ensure that the the GPU is configured with TCPX support.
|
124
|
-
_NETWORK_GCP_IMAGE_ID = ('docker:us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/'
|
125
|
-
'nccl-plugin-gpudirecttcpx')
|
126
|
-
|
127
122
|
|
128
123
|
def _run_output(cmd):
|
129
124
|
proc = subprocess.run(cmd,
|
@@ -263,20 +258,21 @@ class GCP(clouds.Cloud):
|
|
263
258
|
use_spot: bool, region: Optional[str],
|
264
259
|
zone: Optional[str]) -> List[clouds.Region]:
|
265
260
|
if accelerators is None:
|
266
|
-
regions =
|
267
|
-
|
261
|
+
regions = catalog.get_region_zones_for_instance_type(instance_type,
|
262
|
+
use_spot,
|
263
|
+
clouds='gcp')
|
268
264
|
else:
|
269
265
|
assert len(accelerators) == 1, accelerators
|
270
266
|
acc = list(accelerators.keys())[0]
|
271
267
|
acc_count = list(accelerators.values())[0]
|
272
|
-
acc_regions =
|
268
|
+
acc_regions = catalog.get_region_zones_for_accelerators(
|
273
269
|
acc, acc_count, use_spot, clouds='gcp')
|
274
270
|
if instance_type is None:
|
275
271
|
regions = acc_regions
|
276
272
|
elif instance_type == 'TPU-VM':
|
277
273
|
regions = acc_regions
|
278
274
|
else:
|
279
|
-
vm_regions =
|
275
|
+
vm_regions = catalog.get_region_zones_for_instance_type(
|
280
276
|
instance_type, use_spot, clouds='gcp')
|
281
277
|
# Find the intersection between `acc_regions` and `vm_regions`.
|
282
278
|
regions = []
|
@@ -346,11 +342,11 @@ class GCP(clouds.Cloud):
|
|
346
342
|
use_spot: bool,
|
347
343
|
region: Optional[str] = None,
|
348
344
|
zone: Optional[str] = None) -> float:
|
349
|
-
return
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
345
|
+
return catalog.get_hourly_cost(instance_type,
|
346
|
+
use_spot=use_spot,
|
347
|
+
region=region,
|
348
|
+
zone=zone,
|
349
|
+
clouds='gcp')
|
354
350
|
|
355
351
|
def accelerators_to_hourly_cost(self,
|
356
352
|
accelerators: Dict[str, int],
|
@@ -359,12 +355,12 @@ class GCP(clouds.Cloud):
|
|
359
355
|
zone: Optional[str] = None) -> float:
|
360
356
|
assert len(accelerators) == 1, accelerators
|
361
357
|
acc, acc_count = list(accelerators.items())[0]
|
362
|
-
return
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
358
|
+
return catalog.get_accelerator_hourly_cost(acc,
|
359
|
+
acc_count,
|
360
|
+
use_spot=use_spot,
|
361
|
+
region=region,
|
362
|
+
zone=zone,
|
363
|
+
clouds='gcp')
|
368
364
|
|
369
365
|
def get_egress_cost(self, num_gigabytes: float):
|
370
366
|
# In general, query this from the cloud:
|
@@ -444,10 +440,10 @@ class GCP(clouds.Cloud):
|
|
444
440
|
memory: Optional[str] = None,
|
445
441
|
disk_tier: Optional[resources_utils.DiskTier] = None
|
446
442
|
) -> Optional[str]:
|
447
|
-
return
|
448
|
-
|
449
|
-
|
450
|
-
|
443
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
444
|
+
memory=memory,
|
445
|
+
disk_tier=disk_tier,
|
446
|
+
clouds='gcp')
|
451
447
|
|
452
448
|
@classmethod
|
453
449
|
def failover_disk_tier(
|
@@ -547,7 +543,9 @@ class GCP(clouds.Cloud):
|
|
547
543
|
acc.lower())
|
548
544
|
resources_vars['gpu_count'] = acc_count
|
549
545
|
if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
|
550
|
-
|
546
|
+
# The actual image id is set in resources.py (see _try_validate_image_id)
|
547
|
+
# and reference GCP_GPU_DIRECT_IMAGE_ID
|
548
|
+
image_id = _DEFAULT_GPU_DIRECT_IMAGE_ID
|
551
549
|
else:
|
552
550
|
if acc == 'K80':
|
553
551
|
# Though the image is called cu113, it actually has later
|
@@ -566,8 +564,7 @@ class GCP(clouds.Cloud):
|
|
566
564
|
assert region_name in resources.image_id, resources.image_id
|
567
565
|
image_id = resources.image_id[region_name]
|
568
566
|
if image_id.startswith('skypilot:'):
|
569
|
-
image_id =
|
570
|
-
clouds='gcp')
|
567
|
+
image_id = catalog.get_image_id_from_tag(image_id, clouds='gcp')
|
571
568
|
|
572
569
|
assert image_id is not None, (image_id, r)
|
573
570
|
resources_vars['image_id'] = image_id
|
@@ -692,16 +689,16 @@ class GCP(clouds.Cloud):
|
|
692
689
|
|
693
690
|
# For TPU VMs, the instance type is fixed to 'TPU-VM'. However, we still
|
694
691
|
# need to call the below function to get the fuzzy candidate list.
|
695
|
-
(instance_list,
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
692
|
+
(instance_list,
|
693
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
694
|
+
acc,
|
695
|
+
acc_count,
|
696
|
+
cpus=resources.cpus if not use_tpu_vm else None,
|
697
|
+
memory=resources.memory if not use_tpu_vm else None,
|
698
|
+
use_spot=resources.use_spot,
|
699
|
+
region=resources.region,
|
700
|
+
zone=resources.zone,
|
701
|
+
clouds='gcp')
|
705
702
|
|
706
703
|
if instance_list is None:
|
707
704
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
@@ -768,16 +765,16 @@ class GCP(clouds.Cloud):
|
|
768
765
|
# GCP handles accelerators separately from regular instance types.
|
769
766
|
# This method supports automatically inferring the GPU type for
|
770
767
|
# the instance type that come with GPUs pre-attached.
|
771
|
-
return
|
772
|
-
|
768
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
769
|
+
clouds='gcp')
|
773
770
|
|
774
771
|
@classmethod
|
775
772
|
def get_vcpus_mem_from_instance_type(
|
776
773
|
cls,
|
777
774
|
instance_type: str,
|
778
775
|
) -> Tuple[Optional[float], Optional[float]]:
|
779
|
-
return
|
780
|
-
|
776
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
777
|
+
clouds='gcp')
|
781
778
|
|
782
779
|
@classmethod
|
783
780
|
def _find_application_key_path(cls) -> str:
|
@@ -1061,7 +1058,7 @@ class GCP(clouds.Cloud):
|
|
1061
1058
|
return user_identity[0].replace('\n', '')
|
1062
1059
|
|
1063
1060
|
def instance_type_exists(self, instance_type):
|
1064
|
-
return
|
1061
|
+
return catalog.instance_type_exists(instance_type, 'gcp')
|
1065
1062
|
|
1066
1063
|
def need_cleanup_after_preemption_or_failure(
|
1067
1064
|
self, resources: 'resources.Resources') -> bool:
|
@@ -1096,9 +1093,9 @@ class GCP(clouds.Cloud):
|
|
1096
1093
|
def _check_instance_type_accelerators_combination(
|
1097
1094
|
resources: 'resources.Resources') -> None:
|
1098
1095
|
resources = resources.assert_launchable()
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1096
|
+
catalog.check_accelerator_attachable_to_host(resources.instance_type,
|
1097
|
+
resources.accelerators,
|
1098
|
+
resources.zone, 'gcp')
|
1102
1099
|
|
1103
1100
|
@classmethod
|
1104
1101
|
def check_disk_tier(
|
@@ -1325,7 +1322,7 @@ class GCP(clouds.Cloud):
|
|
1325
1322
|
region = resources.region
|
1326
1323
|
|
1327
1324
|
# pylint: disable=import-outside-toplevel
|
1328
|
-
from sky.
|
1325
|
+
from sky.catalog import gcp_catalog
|
1329
1326
|
|
1330
1327
|
quota_code = gcp_catalog.get_quota_code(accelerator, use_spot)
|
1331
1328
|
|
sky/clouds/ibm.py
CHANGED
@@ -5,11 +5,11 @@ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
|
5
5
|
|
6
6
|
import colorama
|
7
7
|
|
8
|
+
from sky import catalog
|
8
9
|
from sky import clouds
|
9
10
|
from sky import sky_logging
|
10
11
|
from sky.adaptors import ibm
|
11
12
|
from sky.adaptors.ibm import CREDENTIAL_FILE
|
12
|
-
from sky.clouds import service_catalog
|
13
13
|
from sky.utils import registry
|
14
14
|
from sky.utils import resources_utils
|
15
15
|
from sky.utils import status_lib
|
@@ -71,7 +71,7 @@ class IBM(clouds.Cloud):
|
|
71
71
|
del accelerators # unused
|
72
72
|
if use_spot:
|
73
73
|
return []
|
74
|
-
regions =
|
74
|
+
regions = catalog.get_region_zones_for_instance_type(
|
75
75
|
instance_type, use_spot, 'ibm')
|
76
76
|
|
77
77
|
if region is not None:
|
@@ -131,11 +131,11 @@ class IBM(clouds.Cloud):
|
|
131
131
|
zone: Optional[str] = None) -> float:
|
132
132
|
# Currently doesn't support spot instances, hence use_spot set to False.
|
133
133
|
del use_spot
|
134
|
-
return
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
134
|
+
return catalog.get_hourly_cost(instance_type,
|
135
|
+
use_spot=False,
|
136
|
+
region=region,
|
137
|
+
zone=zone,
|
138
|
+
clouds='ibm')
|
139
139
|
|
140
140
|
def accelerators_to_hourly_cost(self,
|
141
141
|
accelerators: Dict[str, int],
|
@@ -243,8 +243,8 @@ class IBM(clouds.Cloud):
|
|
243
243
|
cls,
|
244
244
|
instance_type: str,
|
245
245
|
) -> Tuple[Optional[float], Optional[float]]:
|
246
|
-
return
|
247
|
-
|
246
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
247
|
+
clouds='ibm')
|
248
248
|
|
249
249
|
@classmethod
|
250
250
|
def get_accelerators_from_instance_type(
|
@@ -252,8 +252,8 @@ class IBM(clouds.Cloud):
|
|
252
252
|
instance_type: str,
|
253
253
|
) -> Optional[Dict[str, Union[int, float]]]:
|
254
254
|
"""Returns {acc: acc_count} held by 'instance_type', if any."""
|
255
|
-
return
|
256
|
-
|
255
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
256
|
+
clouds='ibm')
|
257
257
|
|
258
258
|
@classmethod
|
259
259
|
def get_default_instance_type(
|
@@ -262,10 +262,10 @@ class IBM(clouds.Cloud):
|
|
262
262
|
memory: Optional[str] = None,
|
263
263
|
disk_tier: Optional['resources_utils.DiskTier'] = None
|
264
264
|
) -> Optional[str]:
|
265
|
-
return
|
266
|
-
|
267
|
-
|
268
|
-
|
265
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
266
|
+
memory=memory,
|
267
|
+
disk_tier=disk_tier,
|
268
|
+
clouds='ibm')
|
269
269
|
|
270
270
|
def _get_feasible_launchable_resources(
|
271
271
|
self, resources: 'resources_lib.Resources'
|
@@ -309,15 +309,15 @@ class IBM(clouds.Cloud):
|
|
309
309
|
|
310
310
|
assert len(accelerators) == 1, resources
|
311
311
|
acc, acc_count = list(accelerators.items())[0]
|
312
|
-
(instance_list,
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
312
|
+
(instance_list,
|
313
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
314
|
+
acc,
|
315
|
+
acc_count,
|
316
|
+
cpus=resources.cpus,
|
317
|
+
memory=resources.memory,
|
318
|
+
region=resources.region,
|
319
|
+
zone=resources.zone,
|
320
|
+
clouds='ibm')
|
321
321
|
if instance_list is None:
|
322
322
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
323
323
|
None)
|
@@ -462,11 +462,11 @@ class IBM(clouds.Cloud):
|
|
462
462
|
|
463
463
|
def instance_type_exists(self, instance_type):
|
464
464
|
"""Returns whether the instance type exists for this cloud."""
|
465
|
-
return
|
465
|
+
return catalog.instance_type_exists(instance_type, clouds='ibm')
|
466
466
|
|
467
467
|
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
468
468
|
"""Validates the region and zone."""
|
469
|
-
return
|
469
|
+
return catalog.validate_region_zone(region, zone, clouds='ibm')
|
470
470
|
|
471
471
|
@classmethod
|
472
472
|
def query_status(cls, name: str, tag_filters: Dict[str, str],
|
sky/clouds/kubernetes.py
CHANGED
@@ -1,17 +1,18 @@
|
|
1
1
|
"""Kubernetes."""
|
2
|
-
import os
|
3
2
|
import re
|
3
|
+
import subprocess
|
4
|
+
import tempfile
|
4
5
|
import typing
|
5
6
|
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
|
6
7
|
|
7
8
|
import colorama
|
8
9
|
|
10
|
+
from sky import catalog
|
9
11
|
from sky import clouds
|
10
12
|
from sky import exceptions
|
11
13
|
from sky import sky_logging
|
12
14
|
from sky import skypilot_config
|
13
15
|
from sky.adaptors import kubernetes
|
14
|
-
from sky.clouds import service_catalog
|
15
16
|
from sky.provision import instance_setup
|
16
17
|
from sky.provision.kubernetes import network_utils
|
17
18
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
@@ -28,10 +29,6 @@ if typing.TYPE_CHECKING:
|
|
28
29
|
|
29
30
|
logger = sky_logging.init_logger(__name__)
|
30
31
|
|
31
|
-
# Check if KUBECONFIG is set, and use it if it is.
|
32
|
-
DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
|
33
|
-
CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
|
34
|
-
|
35
32
|
# Namespace for SkyPilot resources shared across multiple tenants on the
|
36
33
|
# same cluster (even if they might be running in different namespaces).
|
37
34
|
# E.g., FUSE device manager daemonset is run in this namespace.
|
@@ -471,14 +468,14 @@ class Kubernetes(clouds.Cloud):
|
|
471
468
|
# Select image based on whether we are using GPUs or not.
|
472
469
|
image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
|
473
470
|
# Get the container image ID from the service catalog.
|
474
|
-
image_id =
|
475
|
-
|
471
|
+
image_id = catalog.get_image_id_from_tag(image_id,
|
472
|
+
clouds='kubernetes')
|
476
473
|
return image_id
|
477
474
|
|
478
475
|
image_id = _get_image_id(resources)
|
479
476
|
# TODO(romilb): Create a lightweight image for SSH jump host
|
480
|
-
ssh_jump_image =
|
481
|
-
|
477
|
+
ssh_jump_image = catalog.get_image_id_from_tag(self.IMAGE_CPU,
|
478
|
+
clouds='kubernetes')
|
482
479
|
|
483
480
|
# Set environment variables for the pod. Note that SkyPilot env vars
|
484
481
|
# are set separately when the task is run. These env vars are
|
@@ -788,6 +785,7 @@ class Kubernetes(clouds.Cloud):
|
|
788
785
|
"""Checks if the user has access credentials to
|
789
786
|
Kubernetes."""
|
790
787
|
# Check for port forward dependencies
|
788
|
+
logger.info(f'Checking compute credentials for {cls.canonical_name()}')
|
791
789
|
reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
|
792
790
|
if reasons is not None:
|
793
791
|
formatted = '\n'.join(
|
@@ -850,10 +848,24 @@ class Kubernetes(clouds.Cloud):
|
|
850
848
|
return ''.join(message_parts)
|
851
849
|
|
852
850
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
853
|
-
|
851
|
+
credential_paths = kubernetes_utils.get_kubeconfig_paths()
|
852
|
+
if credential_paths:
|
853
|
+
# For single kubeconfig path, keep the original path.
|
854
|
+
kubeconfig_file = credential_paths[0]
|
855
|
+
if len(credential_paths) > 1:
|
856
|
+
# For multiple kubeconfig paths, merge them into a single file.
|
857
|
+
# TODO(aylei): GC merged kubeconfig files.
|
858
|
+
kubeconfig_file = tempfile.NamedTemporaryFile(
|
859
|
+
prefix='merged-kubeconfig-', suffix='.yaml',
|
860
|
+
delete=False).name
|
861
|
+
subprocess.run(
|
862
|
+
'kubectl config view --flatten '
|
863
|
+
f'> {kubeconfig_file}',
|
864
|
+
shell=True,
|
865
|
+
check=True)
|
854
866
|
# Upload kubeconfig to the default path to avoid having to set
|
855
867
|
# KUBECONFIG in the environment.
|
856
|
-
return {DEFAULT_KUBECONFIG_PATH:
|
868
|
+
return {kubernetes.DEFAULT_KUBECONFIG_PATH: kubeconfig_file}
|
857
869
|
else:
|
858
870
|
return {}
|
859
871
|
|
sky/clouds/lambda_cloud.py
CHANGED
@@ -2,9 +2,9 @@
|
|
2
2
|
import typing
|
3
3
|
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
4
4
|
|
5
|
+
from sky import catalog
|
5
6
|
from sky import clouds
|
6
7
|
from sky.adaptors import common as adaptors_common
|
7
|
-
from sky.clouds import service_catalog
|
8
8
|
from sky.provision.lambda_cloud import lambda_utils
|
9
9
|
from sky.utils import registry
|
10
10
|
from sky.utils import resources_utils
|
@@ -73,7 +73,7 @@ class Lambda(clouds.Cloud):
|
|
73
73
|
del accelerators, zone # unused
|
74
74
|
if use_spot:
|
75
75
|
return []
|
76
|
-
regions =
|
76
|
+
regions = catalog.get_region_zones_for_instance_type(
|
77
77
|
instance_type, use_spot, 'lambda')
|
78
78
|
|
79
79
|
if region is not None:
|
@@ -105,11 +105,11 @@ class Lambda(clouds.Cloud):
|
|
105
105
|
use_spot: bool,
|
106
106
|
region: Optional[str] = None,
|
107
107
|
zone: Optional[str] = None) -> float:
|
108
|
-
return
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
108
|
+
return catalog.get_hourly_cost(instance_type,
|
109
|
+
use_spot=use_spot,
|
110
|
+
region=region,
|
111
|
+
zone=zone,
|
112
|
+
clouds='lambda')
|
113
113
|
|
114
114
|
def accelerators_to_hourly_cost(self,
|
115
115
|
accelerators: Dict[str, int],
|
@@ -133,26 +133,26 @@ class Lambda(clouds.Cloud):
|
|
133
133
|
memory: Optional[str] = None,
|
134
134
|
disk_tier: Optional['resources_utils.DiskTier'] = None
|
135
135
|
) -> Optional[str]:
|
136
|
-
return
|
137
|
-
|
138
|
-
|
139
|
-
|
136
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
137
|
+
memory=memory,
|
138
|
+
disk_tier=disk_tier,
|
139
|
+
clouds='lambda')
|
140
140
|
|
141
141
|
@classmethod
|
142
142
|
def get_accelerators_from_instance_type(
|
143
143
|
cls,
|
144
144
|
instance_type: str,
|
145
145
|
) -> Optional[Dict[str, Union[int, float]]]:
|
146
|
-
return
|
147
|
-
|
146
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
147
|
+
clouds='lambda')
|
148
148
|
|
149
149
|
@classmethod
|
150
150
|
def get_vcpus_mem_from_instance_type(
|
151
151
|
cls,
|
152
152
|
instance_type: str,
|
153
153
|
) -> Tuple[Optional[float], Optional[float]]:
|
154
|
-
return
|
155
|
-
|
154
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
155
|
+
clouds='lambda')
|
156
156
|
|
157
157
|
@classmethod
|
158
158
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
@@ -230,16 +230,16 @@ class Lambda(clouds.Cloud):
|
|
230
230
|
|
231
231
|
assert len(accelerators) == 1, resources
|
232
232
|
acc, acc_count = list(accelerators.items())[0]
|
233
|
-
(instance_list,
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
233
|
+
(instance_list,
|
234
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
235
|
+
acc,
|
236
|
+
acc_count,
|
237
|
+
use_spot=resources.use_spot,
|
238
|
+
cpus=resources.cpus,
|
239
|
+
memory=resources.memory,
|
240
|
+
region=resources.region,
|
241
|
+
zone=resources.zone,
|
242
|
+
clouds='lambda')
|
243
243
|
if instance_list is None:
|
244
244
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
245
245
|
None)
|
@@ -278,16 +278,14 @@ class Lambda(clouds.Cloud):
|
|
278
278
|
return None
|
279
279
|
|
280
280
|
def instance_type_exists(self, instance_type: str) -> bool:
|
281
|
-
return
|
281
|
+
return catalog.instance_type_exists(instance_type, 'lambda')
|
282
282
|
|
283
283
|
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
284
|
-
return
|
285
|
-
zone,
|
286
|
-
clouds='lambda')
|
284
|
+
return catalog.validate_region_zone(region, zone, clouds='lambda')
|
287
285
|
|
288
286
|
@classmethod
|
289
287
|
def regions(cls) -> List['clouds.Region']:
|
290
|
-
return
|
288
|
+
return catalog.regions(clouds='lambda')
|
291
289
|
|
292
290
|
@classmethod
|
293
291
|
def query_status(cls, name: str, tag_filters: Dict[str, str],
|