skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""A script that generates the Shadeform catalog.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python fetch_shadeform.py [-h] [--api-key API_KEY]
|
|
5
|
+
[--api-key-path API_KEY_PATH]
|
|
6
|
+
|
|
7
|
+
If neither --api-key nor --api-key-path is provided, this script will read
|
|
8
|
+
`~/.shadeform/api_key` to look for Shadeform API key.
|
|
9
|
+
"""
|
|
10
|
+
import argparse
|
|
11
|
+
import csv
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
from typing import Dict
|
|
15
|
+
|
|
16
|
+
import requests
|
|
17
|
+
|
|
18
|
+
ENDPOINT = 'https://api.shadeform.ai/v1/instances/types'
|
|
19
|
+
DEFAULT_SHADEFORM_API_KEY_PATH = os.path.expanduser('~/.shadeform/api_key')
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def parse_gpu_info(gpu_type: str, num_gpus: int, ram_per_gpu: int) -> Dict:
    """Build the ``GpuInfo`` catalog entry for one accelerator type.

    Args:
        gpu_type: Accelerator name, e.g. ``'MI300X'``.
        num_gpus: Number of accelerators on the instance.
        ram_per_gpu: Per-accelerator memory. The value is stored verbatim
            under the ``SizeInMiB`` key (and multiplied by ``num_gpus`` for
            ``TotalGpuMemoryInMiB``), so callers should pass MiB.

    Returns:
        A dict with a single-element ``'Gpus'`` list describing the
        accelerator's name, manufacturer, count and memory.
    """
    # Any accelerator not listed here is reported as an NVIDIA device.
    non_nvidia_vendors = {'MI300X': 'AMD', 'GAUDI2': 'Intel'}
    vendor = non_nvidia_vendors.get(gpu_type, 'NVIDIA')

    gpu_entry = {
        'Name': gpu_type,
        'Manufacturer': vendor,
        'Count': float(num_gpus),
        'MemoryInfo': {
            'SizeInMiB': ram_per_gpu
        },
        'TotalGpuMemoryInMiB': ram_per_gpu * num_gpus,
    }
    return {'Gpus': [gpu_entry]}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def create_catalog(api_key: str, output_path: str) -> None:
    """Fetch Shadeform instance types and write the GPU catalog as CSV.

    Queries ``ENDPOINT`` for instance types flagged as available and writes
    one row per (GPU instance type, available region) to ``output_path``.
    Instances without GPUs are skipped.

    Args:
        api_key: Shadeform API key, sent in the ``X-API-KEY`` header.
        output_path: Destination CSV path; overwritten if it exists.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    response = requests.get(ENDPOINT,
                            headers={'X-API-KEY': api_key},
                            params={'available': 'true'},
                            timeout=30)
    response.raise_for_status()
    instance_types = response.json().get('instance_types', [])

    # The csv module requires files to be opened with newline='' so that the
    # writer controls line endings itself; without it, platforms that
    # translate '\n' emit an extra blank line after every row.
    with open(output_path, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"')
        writer.writerow([
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])

        for instance in instance_types:
            config = instance['configuration']

            # Only GPU instances are written to the catalog. Decide this
            # before touching the remaining fields so that CPU-only entries
            # are skipped without having to parse GPU-specific fields.
            gpu_count = float(config['num_gpus'])
            if not gpu_count > 0:
                continue

            cloud = instance['cloud']
            shade_instance_type = instance['shade_instance_type']
            instance_type = f'{cloud}_{shade_instance_type.replace("_", "-")}'
            gpu_type = config['gpu_type'].replace('_', '-')
            vcpus = float(config['vcpus'])
            memory_gb = int(config['memory_in_gb'])

            # Append "B" to instance_type and gpu_type if they end with "G"
            if instance_type.endswith('G'):
                instance_type += 'B'
            if gpu_type.endswith('G'):
                gpu_type += 'B'

            # Replace "Gx" with "GBx" (case sensitive)
            instance_type = instance_type.replace('Gx', 'GBx')

            # Price is in cents per hour, convert to dollars
            price = float(instance['hourly_price']) / 100

            # The API reports VRAM in GB while the GpuInfo fields are named
            # in MiB, so convert before building the entry.
            vram_per_gpu_mib = int(config['vram_per_gpu_in_gb']) * 1024
            gpuinfo_dict = parse_gpu_info(gpu_type, int(gpu_count),
                                          vram_per_gpu_mib)
            # Single quotes keep the JSON blob from clashing with the CSV
            # quote character.
            gpuinfo = json.dumps(gpuinfo_dict).replace('"', '\'')

            # Write entry for each available region
            for availability in instance.get('availability', []):
                if not availability['available']:
                    continue
                writer.writerow([
                    instance_type,
                    gpu_type,
                    gpu_count,
                    vcpus,
                    memory_gb,
                    price,
                    availability['region'],
                    gpuinfo,
                    ''  # No spot pricing info available
                ])
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_api_key(cmdline_args: argparse.Namespace) -> str:
    """Get Shadeform API key from cmdline or default path.

    Resolution order: ``--api-key``, then the file given by
    ``--api-key-path``, then ``DEFAULT_SHADEFORM_API_KEY_PATH``.

    Args:
        cmdline_args: Parsed arguments exposing ``api_key`` and
            ``api_key_path`` attributes (either may be ``None``).

    Returns:
        The API key; surrounding whitespace is stripped when the key is read
        from a file.

    Raises:
        FileNotFoundError: If the key has to be read from a file that does
            not exist.
        ValueError: If the resolved key is empty.
    """
    api_key = cmdline_args.api_key
    if api_key is None:
        key_path = cmdline_args.api_key_path
        if key_path is None:
            # Read from ~/.shadeform/api_key
            key_path = DEFAULT_SHADEFORM_API_KEY_PATH
        try:
            with open(key_path, mode='r', encoding='utf-8') as f:
                api_key = f.read().strip()
        except FileNotFoundError as e:
            # Re-raise with guidance; a bare "No such file" does not tell
            # the user how to supply the key.
            raise FileNotFoundError(
                f'API key file not found: {key_path}. Please provide the '
                'key via --api-key or --api-key-path.') from e
    # Explicit check instead of `assert`: asserts are stripped under -O, and
    # an empty key would otherwise only surface later as an HTTP auth error.
    if not api_key:
        raise ValueError('Shadeform API key is empty. Please provide it via '
                         '--api-key or --api-key-path.')
    return api_key
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
if __name__ == '__main__':
    # Script entry point: parse the key-related flags, make sure the output
    # directory exists, then generate the catalog CSV inside it.
    cli = argparse.ArgumentParser()
    cli.add_argument('--api-key', help='Shadeform API key.')
    cli.add_argument('--api-key-path',
                     help='path of file containing Shadeform API key.')
    cli_args = cli.parse_args()

    os.makedirs('shadeform', exist_ok=True)
    shadeform_api_key = get_api_key(cli_args)
    create_catalog(shadeform_api_key, 'shadeform/vms.csv')
    print('Shadeform catalog saved to shadeform/vms.csv')
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
Kubernetes does not require a catalog of instances, but we need an image catalog
|
|
4
4
|
mapping SkyPilot image tags to corresponding container image tags.
|
|
5
5
|
"""
|
|
6
|
+
import collections
|
|
6
7
|
import re
|
|
7
8
|
import typing
|
|
8
9
|
from typing import Dict, List, Optional, Set, Tuple
|
|
@@ -167,12 +168,25 @@ def _list_accelerators(
|
|
|
167
168
|
accelerators_qtys: Set[Tuple[str, int]] = set()
|
|
168
169
|
keys = lf.get_label_keys()
|
|
169
170
|
nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
|
|
170
|
-
|
|
171
|
-
if
|
|
172
|
-
|
|
171
|
+
|
|
172
|
+
# Check if any nodes have accelerators before fetching pods
|
|
173
|
+
has_accelerator_nodes = False
|
|
174
|
+
for node in nodes:
|
|
175
|
+
for key in keys:
|
|
176
|
+
if key in node.metadata.labels:
|
|
177
|
+
has_accelerator_nodes = True
|
|
178
|
+
break
|
|
179
|
+
if has_accelerator_nodes:
|
|
180
|
+
break
|
|
181
|
+
|
|
182
|
+
# Only fetch pods if we have accelerator nodes and realtime is requested
|
|
183
|
+
allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
|
|
184
|
+
error_on_get_allocated_gpu_qty_by_node = False
|
|
185
|
+
if realtime and has_accelerator_nodes:
|
|
186
|
+
# Get the allocated GPU quantity by each node
|
|
173
187
|
try:
|
|
174
|
-
|
|
175
|
-
context=context)
|
|
188
|
+
allocated_qty_by_node = (
|
|
189
|
+
kubernetes_utils.get_allocated_gpu_qty_by_node(context=context))
|
|
176
190
|
except kubernetes.api_exception() as e:
|
|
177
191
|
if e.status == 403:
|
|
178
192
|
logger.warning(
|
|
@@ -180,6 +194,7 @@ def _list_accelerators(
|
|
|
180
194
|
'(forbidden). Please check if your account has '
|
|
181
195
|
'necessary permissions to list pods. Realtime GPU '
|
|
182
196
|
'availability information may be incorrect.')
|
|
197
|
+
error_on_get_allocated_gpu_qty_by_node = True
|
|
183
198
|
else:
|
|
184
199
|
raise
|
|
185
200
|
# Total number of GPUs in the cluster
|
|
@@ -191,7 +206,6 @@ def _list_accelerators(
|
|
|
191
206
|
for node in nodes:
|
|
192
207
|
for key in keys:
|
|
193
208
|
if key in node.metadata.labels:
|
|
194
|
-
allocated_qty = 0
|
|
195
209
|
accelerator_name = lf.get_accelerator_from_label_value(
|
|
196
210
|
node.metadata.labels.get(key))
|
|
197
211
|
|
|
@@ -246,31 +260,13 @@ def _list_accelerators(
|
|
|
246
260
|
total_accelerators_capacity[
|
|
247
261
|
accelerator_name] += quantized_count
|
|
248
262
|
|
|
249
|
-
if
|
|
250
|
-
# If we can't get the
|
|
263
|
+
if error_on_get_allocated_gpu_qty_by_node:
|
|
264
|
+
# If we can't get the allocated GPU quantity by each node,
|
|
265
|
+
# we can't get the GPU usage.
|
|
251
266
|
total_accelerators_available[accelerator_name] = -1
|
|
252
267
|
continue
|
|
253
268
|
|
|
254
|
-
|
|
255
|
-
# Get all the pods running on the node
|
|
256
|
-
if (pod.spec.node_name == node.metadata.name and
|
|
257
|
-
pod.status.phase in ['Running', 'Pending']):
|
|
258
|
-
# Skip pods that should not count against GPU count
|
|
259
|
-
if (kubernetes_utils.
|
|
260
|
-
should_exclude_pod_from_gpu_allocation(pod)):
|
|
261
|
-
logger.debug(
|
|
262
|
-
f'Excluding pod '
|
|
263
|
-
f'{pod.metadata.name} from GPU count '
|
|
264
|
-
f'calculations on node {node.metadata.name}')
|
|
265
|
-
continue
|
|
266
|
-
# Iterate over all the containers in the pod and sum
|
|
267
|
-
# the GPU requests
|
|
268
|
-
for container in pod.spec.containers:
|
|
269
|
-
if container.resources.requests:
|
|
270
|
-
allocated_qty += (
|
|
271
|
-
kubernetes_utils.get_node_accelerator_count(
|
|
272
|
-
context, container.resources.requests))
|
|
273
|
-
|
|
269
|
+
allocated_qty = allocated_qty_by_node[node.metadata.name]
|
|
274
270
|
accelerators_available = accelerator_count - allocated_qty
|
|
275
271
|
# Initialize the total_accelerators_available to make sure the
|
|
276
272
|
# key exists in the dictionary.
|
sky/catalog/runpod_catalog.py
CHANGED
|
@@ -12,7 +12,11 @@ from sky.catalog import common
|
|
|
12
12
|
if typing.TYPE_CHECKING:
|
|
13
13
|
from sky.clouds import cloud
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
# Runpod has no set update schedule for their catalog. We pull the catalog
|
|
16
|
+
# every 7 hours to make sure we have the latest information.
|
|
17
|
+
_PULL_FREQUENCY_HOURS = 7
|
|
18
|
+
_df = common.read_catalog('runpod/vms.csv',
|
|
19
|
+
pull_frequency_hours=_PULL_FREQUENCY_HOURS)
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
def instance_type_exists(instance_type: str) -> bool:
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
""" Shadeform | Catalog
|
|
2
|
+
|
|
3
|
+
This module loads pricing and instance information from the Shadeform API
|
|
4
|
+
and can be used to query instance types and pricing information for Shadeform.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from sky.catalog import common
|
|
13
|
+
|
|
14
|
+
if typing.TYPE_CHECKING:
|
|
15
|
+
from sky.clouds import cloud
|
|
16
|
+
|
|
17
|
+
# Lazily populated by _get_df(): the catalog is loaded on first use, not at import time.
|
|
18
|
+
_df = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_df():
    """Return the cached Shadeform catalog, loading it on the first call.

    The catalog is read from ``shadeform/vms.csv`` through
    ``common.read_catalog``. If that raises ``FileNotFoundError``, an empty
    frame with the catalog columns is cached instead. Otherwise rows without
    an ``InstanceType`` are dropped and, when an ``AcceleratorName`` column
    exists, rows without an accelerator name are dropped too and the
    remaining names are stripped of surrounding whitespace.
    """
    global _df
    if _df is not None:
        return _df

    try:
        catalog = common.read_catalog('shadeform/vms.csv')
    except FileNotFoundError:
        # No catalog file available: cache an empty frame with the expected
        # schema so the query helpers still have the columns they expect.
        _df = pd.DataFrame(columns=[
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])
        return _df

    catalog = catalog[catalog['InstanceType'].notna()]
    if 'AcceleratorName' in catalog.columns:
        catalog = catalog[catalog['AcceleratorName'].notna()]
        stripped_names = catalog['AcceleratorName'].astype(str).str.strip()
        catalog = catalog.assign(AcceleratorName=stripped_names)
    _df = catalog.reset_index(drop=True)
    return _df
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _is_not_found_error(err: ValueError) -> bool:
|
|
48
|
+
msg = str(err).lower()
|
|
49
|
+
return 'not found' in msg or 'not supported' in msg
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _call_or_default(func, default):
|
|
53
|
+
try:
|
|
54
|
+
return func()
|
|
55
|
+
except ValueError as err:
|
|
56
|
+
if _is_not_found_error(err):
|
|
57
|
+
return default
|
|
58
|
+
raise
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def instance_type_exists(instance_type: str) -> bool:
    """Return whether ``instance_type`` is present in the Shadeform catalog."""
    catalog = _get_df()
    return common.instance_type_exists_impl(catalog, instance_type)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Check ``region``/``zone`` against the Shadeform catalog.

    Delegates to ``common.validate_region_zone_impl`` and returns its result.
    """
    catalog = _get_df()
    return common.validate_region_zone_impl('shadeform', catalog, region, zone)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Return the on-demand hourly cost of ``instance_type``.

    Raises:
        ValueError: If ``use_spot`` is set; Shadeform has no spot instances.
    """
    if not use_spot:
        catalog = _get_df()
        return common.get_hourly_cost_impl(catalog, instance_type, use_spot,
                                           region, zone)
    # Shadeform doesn't support spot instances currently
    raise ValueError('Spot instances are not supported on Shadeform')
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Look up vCPU count and memory for ``instance_type``.

    Returns ``(None, None)`` when the lookup fails with a "not found" error.
    """

    def _lookup():
        return common.get_vcpus_mem_from_instance_type_impl(
            _get_df(), instance_type)

    return _call_or_default(_lookup, (None, None))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Pick an instance type satisfying the CPU/memory request.

    ``disk_tier`` is accepted for interface compatibility and ignored.
    Returns ``None`` when the lookup fails with a "not found" error.
    """
    del disk_tier  # Shadeform doesn't support custom disk tiers yet

    def _lookup():
        return common.get_instance_type_for_cpus_mem_impl(
            _get_df(), cpus, memory, region, zone)

    return _call_or_default(_lookup, None)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Look up the accelerators attached to ``instance_type``.

    Returns ``None`` when the lookup fails with a "not found" error.
    """

    def _lookup():
        return common.get_accelerators_from_instance_type_impl(
            _get_df(), instance_type)

    return _call_or_default(_lookup, None)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Find instance types offering ``acc_count`` x ``acc_name``.

    Returns the pair produced by
    ``common.get_instance_type_for_accelerator_impl``. Spot requests
    short-circuit to ``(None, [<explanation>])`` and a "not found" lookup
    error yields ``(None, [])``.
    """
    if use_spot:
        # Spot is not supported, so there is nothing to search for.
        spot_unsupported = 'Spot instances are not supported on Shadeform'
        return None, [spot_unsupported]

    def _lookup():
        return common.get_instance_type_for_accelerator_impl(
            df=_get_df(),
            acc_name=acc_name,
            acc_count=acc_count,
            cpus=cpus,
            memory=memory,
            use_spot=use_spot,
            region=region,
            zone=zone)

    return _call_or_default(_lookup, (None, []))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """List the regions in which ``instance_type`` is offered.

    Returns an empty list for spot requests and when the lookup fails with
    a "not found" error.
    """
    if use_spot:
        return []  # No spot support

    catalog = _get_df()
    matching_rows = catalog[catalog['InstanceType'] == instance_type]

    def _lookup():
        return common.get_region_zones(matching_rows, use_spot)

    return _call_or_default(_lookup, [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """List the accelerator offerings in the Shadeform catalog.

    All filters are forwarded to ``common.list_accelerators_impl``;
    ``require_price`` is accepted for interface compatibility and ignored.
    """
    del require_price  # Unused.
    catalog = _get_df()
    return common.list_accelerators_impl('Shadeform', catalog, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions)
|
sky/check.py
CHANGED
|
@@ -14,6 +14,7 @@ from sky import global_user_state
|
|
|
14
14
|
from sky import sky_logging
|
|
15
15
|
from sky import skypilot_config
|
|
16
16
|
from sky.adaptors import cloudflare
|
|
17
|
+
from sky.adaptors import coreweave
|
|
17
18
|
from sky.clouds import cloud as sky_cloud
|
|
18
19
|
from sky.skylet import constants
|
|
19
20
|
from sky.utils import common_utils
|
|
@@ -33,7 +34,8 @@ def _get_workspace_allowed_clouds(workspace: str) -> List[str]:
|
|
|
33
34
|
# clouds. Also validate names with get_cloud_tuple.
|
|
34
35
|
config_allowed_cloud_names = skypilot_config.get_nested(
|
|
35
36
|
('allowed_clouds',),
|
|
36
|
-
[repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
|
37
|
+
[repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
|
38
|
+
[cloudflare.NAME, coreweave.NAME])
|
|
37
39
|
# filter out the clouds that are disabled in the workspace config
|
|
38
40
|
workspace_disabled_clouds = []
|
|
39
41
|
for cloud in config_allowed_cloud_names:
|
|
@@ -81,7 +83,7 @@ def check_capabilities(
|
|
|
81
83
|
|
|
82
84
|
def get_all_clouds() -> Tuple[str, ...]:
|
|
83
85
|
return tuple([repr(c) for c in registry.CLOUD_REGISTRY.values()] +
|
|
84
|
-
[cloudflare.NAME])
|
|
86
|
+
[cloudflare.NAME, coreweave.NAME])
|
|
85
87
|
|
|
86
88
|
def _execute_check_logic_for_workspace(
|
|
87
89
|
current_workspace_name: str,
|
|
@@ -121,9 +123,12 @@ def check_capabilities(
|
|
|
121
123
|
cloud_name: str
|
|
122
124
|
) -> Tuple[str, Union[sky_clouds.Cloud, ModuleType]]:
|
|
123
125
|
# Validates cloud_name and returns a tuple of the cloud's name and
|
|
124
|
-
# the cloud object. Includes special handling for Cloudflare
|
|
126
|
+
# the cloud object. Includes special handling for Cloudflare and
|
|
127
|
+
# CoreWeave.
|
|
125
128
|
if cloud_name.lower().startswith('cloudflare'):
|
|
126
129
|
return cloudflare.NAME, cloudflare
|
|
130
|
+
elif cloud_name.lower().startswith('coreweave'):
|
|
131
|
+
return coreweave.NAME, coreweave
|
|
127
132
|
else:
|
|
128
133
|
cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud_name)
|
|
129
134
|
assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
|
|
@@ -219,23 +224,24 @@ def check_capabilities(
|
|
|
219
224
|
# allowed_clouds in config.yaml, it will be disabled.
|
|
220
225
|
all_enabled_clouds: Set[str] = set()
|
|
221
226
|
for capability in capabilities:
|
|
222
|
-
# Cloudflare
|
|
223
|
-
# should not be inserted into the DB
|
|
224
|
-
# other code would error out when it's
|
|
225
|
-
# registry).
|
|
227
|
+
# Cloudflare and CoreWeave are not real clouds in
|
|
228
|
+
# registry.CLOUD_REGISTRY, and should not be inserted into the DB
|
|
229
|
+
# (otherwise `sky launch` and other code would error out when it's
|
|
230
|
+
# trying to look it up in the registry).
|
|
226
231
|
enabled_clouds_set = {
|
|
227
232
|
cloud for cloud, capabilities in enabled_clouds.items()
|
|
228
|
-
if capability in capabilities and
|
|
229
|
-
|
|
233
|
+
if capability in capabilities and not cloud.startswith(
|
|
234
|
+
'Cloudflare') and not cloud.startswith('CoreWeave')
|
|
230
235
|
}
|
|
231
236
|
disabled_clouds_set = {
|
|
232
237
|
cloud for cloud, capabilities in disabled_clouds.items()
|
|
233
|
-
if capability in capabilities and
|
|
234
|
-
|
|
238
|
+
if capability in capabilities and not cloud.startswith(
|
|
239
|
+
'Cloudflare') and not cloud.startswith('CoreWeave')
|
|
235
240
|
}
|
|
236
241
|
config_allowed_clouds_set = {
|
|
237
242
|
cloud for cloud in config_allowed_cloud_names
|
|
238
|
-
if not cloud.startswith('Cloudflare')
|
|
243
|
+
if not cloud.startswith('Cloudflare') and
|
|
244
|
+
not cloud.startswith('CoreWeave')
|
|
239
245
|
}
|
|
240
246
|
previously_enabled_clouds_set = {
|
|
241
247
|
repr(cloud)
|
|
@@ -430,6 +436,12 @@ def get_cloud_credential_file_mounts(
|
|
|
430
436
|
if r2_is_enabled:
|
|
431
437
|
r2_credential_mounts = cloudflare.get_credential_file_mounts()
|
|
432
438
|
file_mounts.update(r2_credential_mounts)
|
|
439
|
+
|
|
440
|
+
# Similarly, handle CoreWeave storage credentials
|
|
441
|
+
coreweave_is_enabled, _ = coreweave.check_storage_credentials()
|
|
442
|
+
if coreweave_is_enabled:
|
|
443
|
+
coreweave_credential_mounts = coreweave.get_credential_file_mounts()
|
|
444
|
+
file_mounts.update(coreweave_credential_mounts)
|
|
433
445
|
return file_mounts
|
|
434
446
|
|
|
435
447
|
|
|
@@ -494,7 +506,7 @@ def _print_checked_cloud(
|
|
|
494
506
|
style_str = f'{colorama.Fore.GREEN}{colorama.Style.NORMAL}'
|
|
495
507
|
status_msg = 'enabled'
|
|
496
508
|
capability_string = f'[{", ".join(enabled_capabilities)}]'
|
|
497
|
-
if verbose and cloud is not cloudflare:
|
|
509
|
+
if verbose and cloud is not cloudflare and cloud is not coreweave:
|
|
498
510
|
activated_account = cloud.get_active_user_identity_str()
|
|
499
511
|
if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
|
|
500
512
|
detail_string = _format_context_details(cloud_tuple[1],
|