skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/catalog/data_fetchers/fetch_shadeform.py ADDED
@@ -0,0 +1,142 @@
1
+ """A script that generates the Shadeform catalog.
2
+
3
+ Usage:
4
+ python fetch_shadeform.py [-h] [--api-key API_KEY]
5
+ [--api-key-path API_KEY_PATH]
6
+
7
+ If neither --api-key nor --api-key-path are provided, this script will parse
8
+ `~/.shadeform/api_key` to look for Shadeform API key.
9
+ """
10
+ import argparse
11
+ import csv
12
+ import json
13
+ import os
14
+ from typing import Dict
15
+
16
+ import requests
17
+
18
+ ENDPOINT = 'https://api.shadeform.ai/v1/instances/types'
19
+ DEFAULT_SHADEFORM_API_KEY_PATH = os.path.expanduser('~/.shadeform/api_key')
20
+
21
+
22
+ def parse_gpu_info(gpu_type: str, num_gpus: int, ram_per_gpu: int) -> Dict:
23
+ """Parse GPU information for the catalog."""
24
+
25
+ manufacturer = 'NVIDIA'
26
+ if gpu_type == 'MI300X':
27
+ manufacturer = 'AMD'
28
+ elif gpu_type == 'GAUDI2':
29
+ manufacturer = 'Intel'
30
+
31
+ return {
32
+ 'Gpus': [{
33
+ 'Name': gpu_type,
34
+ 'Manufacturer': manufacturer,
35
+ 'Count': float(num_gpus),
36
+ 'MemoryInfo': {
37
+ 'SizeInMiB': ram_per_gpu
38
+ },
39
+ 'TotalGpuMemoryInMiB': ram_per_gpu * num_gpus
40
+ }]
41
+ }
42
+
43
+
44
+ def create_catalog(api_key: str, output_path: str) -> None:
45
+ """Create Shadeform catalog by fetching from API."""
46
+ headers = {'X-API-KEY': api_key}
47
+
48
+ params = {'available': 'true'}
49
+
50
+ response = requests.get(ENDPOINT,
51
+ headers=headers,
52
+ params=params,
53
+ timeout=30)
54
+ response.raise_for_status()
55
+
56
+ data = response.json()
57
+ instance_types = data.get('instance_types', [])
58
+
59
+ with open(output_path, mode='w', encoding='utf-8') as f:
60
+ writer = csv.writer(f, delimiter=',', quotechar='"')
61
+ writer.writerow([
62
+ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
63
+ 'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
64
+ ])
65
+
66
+ for instance in instance_types:
67
+ config = instance['configuration']
68
+
69
+ cloud = instance['cloud']
70
+ shade_instance_type = instance['shade_instance_type']
71
+ instance_type = f'{cloud}_{shade_instance_type.replace("_", "-")}'
72
+ gpu_type = config['gpu_type'].replace('_', '-')
73
+ gpu_count = float(config['num_gpus'])
74
+ vcpus = float(config['vcpus'])
75
+ memory_gb = int(config['memory_in_gb'])
76
+
77
+ # Append "B" to instance_type and gpu_type if they end with "G"
78
+ if instance_type.endswith('G'):
79
+ instance_type += 'B'
80
+ if gpu_type.endswith('G'):
81
+ gpu_type += 'B'
82
+
83
+ # Replace "Gx" with "GBx" (case sensitive)
84
+ if 'Gx' in instance_type:
85
+ instance_type = instance_type.replace('Gx', 'GBx')
86
+
87
+ # Price is in cents per hour, convert to dollars
88
+ price = float(instance['hourly_price']) / 100
89
+
90
+ # Create GPU info
91
+ gpuinfo = None
92
+ if gpu_count > 0:
93
+ gpuinfo_dict = parse_gpu_info(gpu_type, int(gpu_count),
94
+ int(config['vram_per_gpu_in_gb']))
95
+ gpuinfo = json.dumps(gpuinfo_dict).replace('"', '\'')
96
+
97
+ # Write entry for each available region
98
+ for availability in instance.get('availability', []):
99
+ if availability['available'] and gpu_count > 0:
100
+ region = availability['region']
101
+ writer.writerow([
102
+ instance_type,
103
+ gpu_type,
104
+ gpu_count,
105
+ vcpus,
106
+ memory_gb,
107
+ price,
108
+ region,
109
+ gpuinfo,
110
+ '' # No spot pricing info available
111
+ ])
112
+
113
+
114
+ def get_api_key(cmdline_args: argparse.Namespace) -> str:
115
+ """Get Shadeform API key from cmdline or default path."""
116
+ api_key = cmdline_args.api_key
117
+ if api_key is None:
118
+ if cmdline_args.api_key_path is not None:
119
+ with open(cmdline_args.api_key_path, mode='r',
120
+ encoding='utf-8') as f:
121
+ api_key = f.read().strip()
122
+ else:
123
+ # Read from ~/.shadeform/api_key
124
+ with open(DEFAULT_SHADEFORM_API_KEY_PATH,
125
+ mode='r',
126
+ encoding='utf-8') as f:
127
+ api_key = f.read().strip()
128
+ assert api_key is not None, (
129
+ f'API key not found. Please provide via --api-key or place in '
130
+ f'{DEFAULT_SHADEFORM_API_KEY_PATH}')
131
+ return api_key
132
+
133
+
134
+ if __name__ == '__main__':
135
+ parser = argparse.ArgumentParser()
136
+ parser.add_argument('--api-key', help='Shadeform API key.')
137
+ parser.add_argument('--api-key-path',
138
+ help='path of file containing Shadeform API key.')
139
+ args = parser.parse_args()
140
+ os.makedirs('shadeform', exist_ok=True)
141
+ create_catalog(get_api_key(args), 'shadeform/vms.csv')
142
+ print('Shadeform catalog saved to shadeform/vms.csv')
sky/catalog/kubernetes_catalog.py CHANGED
@@ -3,6 +3,7 @@
3
3
  Kubernetes does not require a catalog of instances, but we need an image catalog
4
4
  mapping SkyPilot image tags to corresponding container image tags.
5
5
  """
6
+ import collections
6
7
  import re
7
8
  import typing
8
9
  from typing import Dict, List, Optional, Set, Tuple
@@ -167,12 +168,25 @@ def _list_accelerators(
167
168
  accelerators_qtys: Set[Tuple[str, int]] = set()
168
169
  keys = lf.get_label_keys()
169
170
  nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
170
- pods = None
171
- if realtime:
172
- # Get the pods to get the real-time GPU usage
171
+
172
+ # Check if any nodes have accelerators before fetching pods
173
+ has_accelerator_nodes = False
174
+ for node in nodes:
175
+ for key in keys:
176
+ if key in node.metadata.labels:
177
+ has_accelerator_nodes = True
178
+ break
179
+ if has_accelerator_nodes:
180
+ break
181
+
182
+ # Only fetch pods if we have accelerator nodes and realtime is requested
183
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
184
+ error_on_get_allocated_gpu_qty_by_node = False
185
+ if realtime and has_accelerator_nodes:
186
+ # Get the allocated GPU quantity by each node
173
187
  try:
174
- pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(
175
- context=context)
188
+ allocated_qty_by_node = (
189
+ kubernetes_utils.get_allocated_gpu_qty_by_node(context=context))
176
190
  except kubernetes.api_exception() as e:
177
191
  if e.status == 403:
178
192
  logger.warning(
@@ -180,6 +194,7 @@ def _list_accelerators(
180
194
  '(forbidden). Please check if your account has '
181
195
  'necessary permissions to list pods. Realtime GPU '
182
196
  'availability information may be incorrect.')
197
+ error_on_get_allocated_gpu_qty_by_node = True
183
198
  else:
184
199
  raise
185
200
  # Total number of GPUs in the cluster
@@ -191,7 +206,6 @@ def _list_accelerators(
191
206
  for node in nodes:
192
207
  for key in keys:
193
208
  if key in node.metadata.labels:
194
- allocated_qty = 0
195
209
  accelerator_name = lf.get_accelerator_from_label_value(
196
210
  node.metadata.labels.get(key))
197
211
 
@@ -246,31 +260,13 @@ def _list_accelerators(
246
260
  total_accelerators_capacity[
247
261
  accelerator_name] += quantized_count
248
262
 
249
- if pods is None:
250
- # If we can't get the pods, we can't get the GPU usage
263
+ if error_on_get_allocated_gpu_qty_by_node:
264
+ # If we can't get the allocated GPU quantity by each node,
265
+ # we can't get the GPU usage.
251
266
  total_accelerators_available[accelerator_name] = -1
252
267
  continue
253
268
 
254
- for pod in pods:
255
- # Get all the pods running on the node
256
- if (pod.spec.node_name == node.metadata.name and
257
- pod.status.phase in ['Running', 'Pending']):
258
- # Skip pods that should not count against GPU count
259
- if (kubernetes_utils.
260
- should_exclude_pod_from_gpu_allocation(pod)):
261
- logger.debug(
262
- f'Excluding pod '
263
- f'{pod.metadata.name} from GPU count '
264
- f'calculations on node {node.metadata.name}')
265
- continue
266
- # Iterate over all the containers in the pod and sum
267
- # the GPU requests
268
- for container in pod.spec.containers:
269
- if container.resources.requests:
270
- allocated_qty += (
271
- kubernetes_utils.get_node_accelerator_count(
272
- context, container.resources.requests))
273
-
269
+ allocated_qty = allocated_qty_by_node[node.metadata.name]
274
270
  accelerators_available = accelerator_count - allocated_qty
275
271
  # Initialize the total_accelerators_available to make sure the
276
272
  # key exists in the dictionary.
sky/catalog/runpod_catalog.py CHANGED
@@ -12,7 +12,11 @@ from sky.catalog import common
12
12
  if typing.TYPE_CHECKING:
13
13
  from sky.clouds import cloud
14
14
 
15
- _df = common.read_catalog('runpod/vms.csv')
15
+ # Runpod has no set updated schedule for their catalog. We pull the catalog
16
+ # every 7 hours to make sure we have the latest information.
17
+ _PULL_FREQUENCY_HOURS = 7
18
+ _df = common.read_catalog('runpod/vms.csv',
19
+ pull_frequency_hours=_PULL_FREQUENCY_HOURS)
16
20
 
17
21
 
18
22
  def instance_type_exists(instance_type: str) -> bool:
sky/catalog/shadeform_catalog.py ADDED
@@ -0,0 +1,165 @@
1
+ """ Shadeform | Catalog
2
+
3
+ This module loads pricing and instance information from the Shadeform API
4
+ and can be used to query instance types and pricing information for Shadeform.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ import pandas as pd
11
+
12
+ from sky.catalog import common
13
+
14
+ if typing.TYPE_CHECKING:
15
+ from sky.clouds import cloud
16
+
17
+ # We'll use dynamic fetching, so no static CSV file to load
18
+ _df = None
19
+
20
+
21
+ def _get_df():
22
+ """Get the dataframe, fetching from API if needed."""
23
+ global _df
24
+ if _df is None:
25
+ # For now, we'll fall back to a minimal static catalog
26
+ # In a full implementation, this would call the Shadeform API
27
+ # to dynamically fetch the latest instance types and pricing
28
+ try:
29
+ df = common.read_catalog('shadeform/vms.csv')
30
+ except FileNotFoundError:
31
+ # If no static catalog exists, create an empty one
32
+ # This would be replaced with dynamic API fetching
33
+ _df = pd.DataFrame(columns=[
34
+ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
35
+ 'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
36
+ ])
37
+ else:
38
+ df = df[df['InstanceType'].notna()]
39
+ if 'AcceleratorName' in df.columns:
40
+ df = df[df['AcceleratorName'].notna()]
41
+ df = df.assign(AcceleratorName=df['AcceleratorName'].astype(
42
+ str).str.strip())
43
+ _df = df.reset_index(drop=True)
44
+ return _df
45
+
46
+
47
+ def _is_not_found_error(err: ValueError) -> bool:
48
+ msg = str(err).lower()
49
+ return 'not found' in msg or 'not supported' in msg
50
+
51
+
52
+ def _call_or_default(func, default):
53
+ try:
54
+ return func()
55
+ except ValueError as err:
56
+ if _is_not_found_error(err):
57
+ return default
58
+ raise
59
+
60
+
61
+ def instance_type_exists(instance_type: str) -> bool:
62
+ """Check if an instance type exists."""
63
+ return common.instance_type_exists_impl(_get_df(), instance_type)
64
+
65
+
66
+ def validate_region_zone(
67
+ region: Optional[str],
68
+ zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
69
+ """Validate region and zone for Shadeform."""
70
+ return common.validate_region_zone_impl('shadeform', _get_df(), region,
71
+ zone)
72
+
73
+
74
+ def get_hourly_cost(instance_type: str,
75
+ use_spot: bool = False,
76
+ region: Optional[str] = None,
77
+ zone: Optional[str] = None) -> float:
78
+ """Returns the cost, or the cheapest cost among all zones for spot."""
79
+ # Shadeform doesn't support spot instances currently
80
+ if use_spot:
81
+ raise ValueError('Spot instances are not supported on Shadeform')
82
+
83
+ return common.get_hourly_cost_impl(_get_df(), instance_type, use_spot,
84
+ region, zone)
85
+
86
+
87
+ def get_vcpus_mem_from_instance_type(
88
+ instance_type: str) -> Tuple[Optional[float], Optional[float]]:
89
+ """Get vCPUs and memory from instance type."""
90
+ return _call_or_default(
91
+ lambda: common.get_vcpus_mem_from_instance_type_impl(
92
+ _get_df(), instance_type), (None, None))
93
+
94
+
95
+ def get_default_instance_type(cpus: Optional[str] = None,
96
+ memory: Optional[str] = None,
97
+ disk_tier: Optional[str] = None,
98
+ region: Optional[str] = None,
99
+ zone: Optional[str] = None) -> Optional[str]:
100
+ """Get default instance type based on requirements."""
101
+ del disk_tier # Shadeform doesn't support custom disk tiers yet
102
+ return _call_or_default(
103
+ lambda: common.get_instance_type_for_cpus_mem_impl(
104
+ _get_df(), cpus, memory, region, zone), None)
105
+
106
+
107
+ def get_accelerators_from_instance_type(
108
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
109
+ """Get accelerator information from instance type."""
110
+ return _call_or_default(
111
+ lambda: common.get_accelerators_from_instance_type_impl(
112
+ _get_df(), instance_type), None)
113
+
114
+
115
+ def get_instance_type_for_accelerator(
116
+ acc_name: str,
117
+ acc_count: int,
118
+ cpus: Optional[str] = None,
119
+ memory: Optional[str] = None,
120
+ use_spot: bool = False,
121
+ region: Optional[str] = None,
122
+ zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
123
+ """Returns a list of instance types that have the given accelerator."""
124
+ if use_spot:
125
+ # Return empty lists since spot is not supported
126
+ return None, ['Spot instances are not supported on Shadeform']
127
+
128
+ return _call_or_default(
129
+ lambda: common.get_instance_type_for_accelerator_impl(
130
+ df=_get_df(),
131
+ acc_name=acc_name,
132
+ acc_count=acc_count,
133
+ cpus=cpus,
134
+ memory=memory,
135
+ use_spot=use_spot,
136
+ region=region,
137
+ zone=zone), (None, []))
138
+
139
+
140
+ def get_region_zones_for_instance_type(instance_type: str,
141
+ use_spot: bool) -> List['cloud.Region']:
142
+ """Get regions and zones for an instance type."""
143
+ if use_spot:
144
+ return [] # No spot support
145
+
146
+ df = _get_df()
147
+ df_filtered = df[df['InstanceType'] == instance_type]
148
+ return _call_or_default(
149
+ lambda: common.get_region_zones(df_filtered, use_spot), [])
150
+
151
+
152
+ def list_accelerators(
153
+ gpus_only: bool,
154
+ name_filter: Optional[str],
155
+ region_filter: Optional[str],
156
+ quantity_filter: Optional[int],
157
+ case_sensitive: bool = True,
158
+ all_regions: bool = False,
159
+ require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
160
+ """Returns all instance types in Shadeform offering GPUs."""
161
+ del require_price # Unused.
162
+ return common.list_accelerators_impl('Shadeform', _get_df(), gpus_only,
163
+ name_filter, region_filter,
164
+ quantity_filter, case_sensitive,
165
+ all_regions)
sky/check.py CHANGED
@@ -14,6 +14,7 @@ from sky import global_user_state
14
14
  from sky import sky_logging
15
15
  from sky import skypilot_config
16
16
  from sky.adaptors import cloudflare
17
+ from sky.adaptors import coreweave
17
18
  from sky.clouds import cloud as sky_cloud
18
19
  from sky.skylet import constants
19
20
  from sky.utils import common_utils
@@ -33,7 +34,8 @@ def _get_workspace_allowed_clouds(workspace: str) -> List[str]:
33
34
  # clouds. Also validate names with get_cloud_tuple.
34
35
  config_allowed_cloud_names = skypilot_config.get_nested(
35
36
  ('allowed_clouds',),
36
- [repr(c) for c in registry.CLOUD_REGISTRY.values()] + [cloudflare.NAME])
37
+ [repr(c) for c in registry.CLOUD_REGISTRY.values()] +
38
+ [cloudflare.NAME, coreweave.NAME])
37
39
  # filter out the clouds that are disabled in the workspace config
38
40
  workspace_disabled_clouds = []
39
41
  for cloud in config_allowed_cloud_names:
@@ -81,7 +83,7 @@ def check_capabilities(
81
83
 
82
84
  def get_all_clouds() -> Tuple[str, ...]:
83
85
  return tuple([repr(c) for c in registry.CLOUD_REGISTRY.values()] +
84
- [cloudflare.NAME])
86
+ [cloudflare.NAME, coreweave.NAME])
85
87
 
86
88
  def _execute_check_logic_for_workspace(
87
89
  current_workspace_name: str,
@@ -121,9 +123,12 @@ def check_capabilities(
121
123
  cloud_name: str
122
124
  ) -> Tuple[str, Union[sky_clouds.Cloud, ModuleType]]:
123
125
  # Validates cloud_name and returns a tuple of the cloud's name and
124
- # the cloud object. Includes special handling for Cloudflare.
126
+ # the cloud object. Includes special handling for Cloudflare and
127
+ # CoreWeave.
125
128
  if cloud_name.lower().startswith('cloudflare'):
126
129
  return cloudflare.NAME, cloudflare
130
+ elif cloud_name.lower().startswith('coreweave'):
131
+ return coreweave.NAME, coreweave
127
132
  else:
128
133
  cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud_name)
129
134
  assert cloud_obj is not None, f'Cloud {cloud_name!r} not found'
@@ -219,23 +224,24 @@ def check_capabilities(
219
224
  # allowed_clouds in config.yaml, it will be disabled.
220
225
  all_enabled_clouds: Set[str] = set()
221
226
  for capability in capabilities:
222
- # Cloudflare is not a real cloud in registry.CLOUD_REGISTRY, and
223
- # should not be inserted into the DB (otherwise `sky launch` and
224
- # other code would error out when it's trying to look it up in the
225
- # registry).
227
+ # Cloudflare and CoreWeave are not real clouds in
228
+ # registry.CLOUD_REGISTRY, and should not be inserted into the DB
229
+ # (otherwise `sky launch` and other code would error out when it's
230
+ # trying to look it up in the registry).
226
231
  enabled_clouds_set = {
227
232
  cloud for cloud, capabilities in enabled_clouds.items()
228
- if capability in capabilities and
229
- not cloud.startswith('Cloudflare')
233
+ if capability in capabilities and not cloud.startswith(
234
+ 'Cloudflare') and not cloud.startswith('CoreWeave')
230
235
  }
231
236
  disabled_clouds_set = {
232
237
  cloud for cloud, capabilities in disabled_clouds.items()
233
- if capability in capabilities and
234
- not cloud.startswith('Cloudflare')
238
+ if capability in capabilities and not cloud.startswith(
239
+ 'Cloudflare') and not cloud.startswith('CoreWeave')
235
240
  }
236
241
  config_allowed_clouds_set = {
237
242
  cloud for cloud in config_allowed_cloud_names
238
- if not cloud.startswith('Cloudflare')
243
+ if not cloud.startswith('Cloudflare') and
244
+ not cloud.startswith('CoreWeave')
239
245
  }
240
246
  previously_enabled_clouds_set = {
241
247
  repr(cloud)
@@ -430,6 +436,12 @@ def get_cloud_credential_file_mounts(
430
436
  if r2_is_enabled:
431
437
  r2_credential_mounts = cloudflare.get_credential_file_mounts()
432
438
  file_mounts.update(r2_credential_mounts)
439
+
440
+ # Similarly, handle CoreWeave storage credentials
441
+ coreweave_is_enabled, _ = coreweave.check_storage_credentials()
442
+ if coreweave_is_enabled:
443
+ coreweave_credential_mounts = coreweave.get_credential_file_mounts()
444
+ file_mounts.update(coreweave_credential_mounts)
433
445
  return file_mounts
434
446
 
435
447
 
@@ -494,7 +506,7 @@ def _print_checked_cloud(
494
506
  style_str = f'{colorama.Fore.GREEN}{colorama.Style.NORMAL}'
495
507
  status_msg = 'enabled'
496
508
  capability_string = f'[{", ".join(enabled_capabilities)}]'
497
- if verbose and cloud is not cloudflare:
509
+ if verbose and cloud is not cloudflare and cloud is not coreweave:
498
510
  activated_account = cloud.get_active_user_identity_str()
499
511
  if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
500
512
  detail_string = _format_context_details(cloud_tuple[1],