skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -238,6 +238,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1
 
 
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
@@ -272,19 +306,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     kubernetes.api_exception(),
                     kubernetes.config_exception()) as e:
                 last_exception = e
+
+                # Check if this is a CloudFlare transient 403 error
+                is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                 # Don't retry on permanent errors like 401 (Unauthorized)
-                # or 403 (Forbidden)
+                # or 403 (Forbidden), unless it's a CloudFlare transient 403
                 if (isinstance(e, kubernetes.api_exception()) and
-                        e.status in (401, 403)):
+                        e.status in (401, 403) and not is_cloudflare_403):
                     # Raise KubeAPIUnreachableError exception so that the
                     # optimizer/provisioner can failover to other clouds.
                     raise exceptions.KubeAPIUnreachableError(
                         f'Kubernetes API error: {str(e)}') from e
                 if attempt < max_retries - 1:
                     sleep_time = backoff.current_backoff()
-                    logger.debug(f'Kubernetes API call {func.__name__} '
-                                 f'failed with {str(e)}. Retrying in '
-                                 f'{sleep_time:.1f}s...')
+                    error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                    logger.debug(
+                        f'Kubernetes API call {func.__name__} '
+                        f'failed with {error_type} {str(e)}. Retrying in '
+                        f'{sleep_time:.1f}s...')
                     time.sleep(sleep_time)
                     continue
 
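With this change, a 403 from the Kubernetes API is only treated as a permanent auth failure (and converted into KubeAPIUnreachableError) when it does not carry CloudFlare proxy headers; a 403 with a CF-RAY header or a Server: cloudflare header is retried with backoff like any other transient error. Below is a minimal standalone sketch of the same header check, using plain dicts and hypothetical sample header values rather than a real kubernetes ApiException.

# Illustrative only: mirrors the _is_cloudflare_403_error header check on plain
# dicts. The header values are made-up samples, not taken from the diff.
def looks_like_cloudflare_403(status: int, headers: dict) -> bool:
    if status != 403 or not headers:
        return False
    for key, value in headers.items():
        if 'cf-ray' in key.lower():
            return True
        if 'server' in key.lower() and 'cloudflare' in str(value).lower():
            return True
    return False

assert looks_like_cloudflare_403(403, {'CF-RAY': '8a1b2c3d4e5f-SJC'})   # retried
assert not looks_like_cloudflare_403(403, {'Server': 'nginx'})          # real RBAC 403
assert not looks_like_cloudflare_403(401, {'Server': 'cloudflare'})     # still permanent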
@@ -696,6 +736,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
         skip = False
@@ -709,11 +750,8 @@ def detect_gpu_label_formatter(
                 if valid:
                     return lf(), node_labels
                 else:
-                    logger.warning(f'GPU label {label} matched for label '
-                                   f'formatter {lf.__class__.__name__}, '
-                                   f'but has invalid value {value}. '
-                                   f'Reason: {reason}. '
-                                   'Skipping...')
+                    invalid_label_values.append(
+                        (label, lf.__name__, value, reason))
                     skip = True
                     break
             if skip:
@@ -721,6 +759,13 @@ def detect_gpu_label_formatter(
         if skip:
             continue
 
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels
 
 
@@ -1259,30 +1304,52 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def get_all_pods_in_kubernetes_cluster(*,
-                                       context: Optional[str] = None
-                                       ) -> List[V1Pod]:
-    """Gets pods in all namespaces in kubernetes cluster indicated by context.
-
-    Used for computing cluster resource usage.
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])
 
     # Return raw urllib3.HTTPResponse object so that we can parse the json
     # more efficiently.
     response = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
     try:
-        pods = [
-            V1Pod.from_dict(item_dict) for item_dict in ijson.items(
-                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
-        ]
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
     finally:
         response.release_conn()
 
-    return pods
-
 
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
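The rewritten helper also pushes the Running/Pending filter down to the API server via a field selector, so only active pods are streamed and parsed instead of every pod in every namespace being fetched and filtered client-side as get_all_pods_in_kubernetes_cluster did. A sketch of the selector construction, assuming POD_STATUSES holds the five standard pod phases (the real constant lives elsewhere in this module):

# Sketch only: POD_STATUSES is assumed to contain the standard phases.
POD_STATUSES = {'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown'}
excluded = POD_STATUSES - {'Running', 'Pending'}
field_selector = ','.join(f'status.phase!={status}' for status in excluded)
# e.g. 'status.phase!=Succeeded,status.phase!=Failed,status.phase!=Unknown'
# (set ordering is arbitrary); the API server then returns only pods that are
# Running or Pending.
print(field_selector)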
@@ -2179,6 +2246,15 @@ def get_kube_config_context_namespace(
         return DEFAULT_NAMESPACE
 
 
+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
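The new helper behaves like parse_cpu_or_gpu_resource below it but always returns a float and tolerates an empty string; quantities ending in 'm' are millicores and are divided by 1000. A short worked example, with the function body copied from the hunk above:

def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
    if not resource_str:
        return 0.0
    if resource_str[-1] == 'm':
        return float(resource_str[:-1]) / 1000
    return float(resource_str)

assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5   # 500 millicores
assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
assert parse_cpu_or_gpu_resource_to_float('') == 0.0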
@@ -2738,7 +2814,8 @@ def merge_custom_metadata(
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
@@ -2965,41 +3042,24 @@ def get_kubernetes_node_info(
     label_keys = lf.get_label_keys()
 
     # Check if all nodes have no accelerators to avoid fetching pods
-    any_node_has_accelerators = False
+    has_accelerator_nodes = False
     for node in nodes:
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
         if accelerator_count > 0:
-            any_node_has_accelerators = True
+            has_accelerator_nodes = True
             break
 
-    # Get the pods to get the real-time resource usage
-    pods = None
+    # Get the allocated GPU quantity by each node
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-    if any_node_has_accelerators:
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
         try:
-            pods = get_all_pods_in_kubernetes_cluster(context=context)
-            # Pre-compute allocated accelerator count per node
-            for pod in pods:
-                if pod.status.phase in ['Running', 'Pending']:
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(f'Excluding low priority pod '
-                                     f'{pod.metadata.name} from GPU allocation '
-                                     f'calculations')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    pod_allocated_qty = 0
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            pod_allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-                    if pod_allocated_qty > 0:
-                        allocated_qty_by_node[
-                            pod.spec.node_name] += pod_allocated_qty
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
                 pass
             else:
                 raise
@@ -3044,7 +3104,7 @@
                 ip_address=node_ip)
             continue
 
-        if pods is None:
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
            accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]
@@ -3241,13 +3301,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
 
@@ -3384,7 +3444,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
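Both call sites now reference provision_constants.TAG_SKYPILOT_CLUSTER_NAME instead of the hard-coded 'skypilot-cluster' label; the updated kubectl hint in the same hunk suggests the constant resolves to 'skypilot-cluster-name'. A rough equivalent of the new query using the official kubernetes Python client, with the label key inferred from that hint:

# Sketch only: lists any pod carrying the skypilot-cluster-name label key.
from kubernetes import client, config

config.load_kube_config()
pods = client.CoreV1Api().list_pod_for_all_namespaces(
    label_selector='skypilot-cluster-name')
for pod in pods.items:
    print(pod.metadata.namespace, pod.metadata.name,
          pod.metadata.labels.get('skypilot-cluster-name'))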
@@ -5,6 +5,7 @@ from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky.adaptors import kubernetes
+from sky.provision import constants
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
@@ -75,7 +76,6 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     """Deletes a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_name = config.name_on_cloud
-    logger.info(f'Deleting PVC {pvc_name}')
     kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
             context).delete_namespaced_persistent_volume_claim(
@@ -84,6 +84,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             _request_timeout=config_lib.DELETION_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
+    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
     return config
 
 
@@ -128,7 +129,7 @@ def _get_volume_usedby(
             usedby_pods.append(pod.metadata.name)
             # Get the real cluster name
             cluster_name_on_cloud = pod.metadata.labels.get(
-                k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
+                constants.TAG_SKYPILOT_CLUSTER_NAME)
             if cluster_name_on_cloud is None:
                 continue
             cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
@@ -205,7 +206,7 @@ def get_all_volumes_usedby(
                 used_by_pods[context][namespace][volume_name].append(
                     pod.metadata.name)
                 cluster_name_on_cloud = pod.metadata.labels.get(
-                    k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
+                    constants.TAG_SKYPILOT_CLUSTER_NAME)
                 if cluster_name_on_cloud is None:
                     continue
                 cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
@@ -242,9 +243,9 @@ def create_persistent_volume_claim(namespace: str, context: Optional[str],
     except kubernetes.api_exception() as e:
         if e.status != 404:  # Not found
             raise
-    logger.info(f'Creating PVC {pvc_name}')
     kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
         namespace=namespace, body=pvc_spec)
+    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
 
 
 def _get_pvc_spec(namespace: str,
@@ -232,9 +232,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -254,9 +254,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(provider_config['region'],
                                   cluster_name_on_cloud, None)
@@ -36,6 +36,7 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
@@ -44,7 +45,7 @@ def query_instances(
     A None status means the instance is marked as "terminated"
     or "terminating".
     """
-    del cluster_name  # unusedå
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, cluster_name_on_cloud
     region = provider_config['region']
 
@@ -281,9 +281,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name, non_terminated_only  #unused
+    del cluster_name, non_terminated_only, retry_if_missing  #unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
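These hunks, and the matching query_instances hunks further down, add a retry_if_missing parameter to each per-cloud implementation so they track the updated interface in sky/provision/__init__.py; the clouds touched here accept the flag and immediately discard it. A skeleton of the shared signature, with Optional[str] standing in for status_lib.ClusterStatus and a placeholder body so the sketch stays self-contained:

# Sketch of the shared signature these provisioners now implement; the body is
# a placeholder, not any cloud's real logic.
from typing import Any, Dict, Optional, Tuple

def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional[str], Optional[str]]]:
    del cluster_name, retry_if_missing  # accepted for interface parity, unused
    assert provider_config is not None, cluster_name_on_cloud
    return {}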
@@ -442,6 +442,14 @@ def _post_provision_setup(
         cluster_name.name_on_cloud,
         provider_config=provider_config)
 
+    # Update cluster info in handle so cluster instance ids are set. This
+    # allows us to expose provision logs to debug nodes that failed during post
+    # provision setup.
+    handle = global_user_state.get_handle_from_cluster_name(
+        cluster_name.display_name)
+    handle.cached_cluster_info = cluster_info
+    global_user_state.update_cluster_handle(cluster_name.display_name, handle)
+
     if cluster_info.num_instances > 1:
         # Only worker nodes have logs in the per-instance log directory. Head
         # node's log will be redirected to the main log file.
@@ -477,12 +485,13 @@ def _post_provision_setup(
     # ready by the provisioner, and we use kubectl instead of SSH to run the
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
-    if cloud_name.lower() != 'kubernetes':
+    is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
+    if not is_k8s_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
         logger.debug(f'SSH Connection ready for {cluster_name!r}')
-    vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
+    vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
     plural = '' if len(cluster_info.instances) == 1 else 's'
     verb = 'is' if len(cluster_info.instances) == 1 else 'are'
     indent_str = (ux_utils.INDENT_SYMBOL
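The SSH-wait skip is broadened from the 'kubernetes' cloud alone to both pod-based clouds, 'kubernetes' and 'ssh' (SSH node pools), and the same flag now also drives the 'Instance' vs 'Pod' wording. A minimal sketch of the new check:

# Sketch of the new behavior for a few example cloud names.
for cloud_name in ['AWS', 'Kubernetes', 'SSH']:
    is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
    vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
    action = 'skip SSH wait' if is_k8s_cloud else 'wait for SSH'
    print(cloud_name, action, vm_str)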
@@ -222,9 +222,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -431,8 +431,9 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -9,7 +9,6 @@ import subprocess
 import time
 from typing import Any, Dict, List, Optional, Tuple
 
-from sky import authentication as auth
 from sky import sky_logging
 from sky.adaptors import seeweb as seeweb_adaptor
 from sky.provision import common
@@ -17,6 +16,7 @@ from sky.provision.common import ClusterInfo
 from sky.provision.common import InstanceInfo
 from sky.provision.common import ProvisionConfig
 from sky.provision.common import ProvisionRecord
+from sky.utils import auth_utils
 from sky.utils import command_runner  # Unified SSH helper
 from sky.utils import common_utils
 from sky.utils import status_lib
@@ -75,7 +75,7 @@ class SeewebNodeProvider:
         if self.config and self.config.authentication_config:
             key_path = self.config.authentication_config.get('ssh_private_key')
             if not key_path:
-                key_path, _ = auth.get_or_generate_keys()
+                key_path, _ = auth_utils.get_or_generate_keys()
         return os.path.expanduser(key_path)
 
     # ------------------------------------------------------------------ #
@@ -661,7 +661,7 @@ def _ping_server_standalone(server_ip: str) -> bool:
 def _check_ssh_ready_standalone(server_ip: str) -> bool:
     """Check that SSH is available on the server (standalone version)."""
     try:
-        private_key_path, _ = auth.get_or_generate_keys()
+        private_key_path, _ = auth_utils.get_or_generate_keys()
         private_key_path = os.path.expanduser(private_key_path)
         ssh_user = 'ecuser'
         result = subprocess.run([
@@ -0,0 +1,11 @@
+"""Shadeform provisioner."""
+
+from sky.provision.shadeform.config import bootstrap_instances
+from sky.provision.shadeform.instance import cleanup_ports
+from sky.provision.shadeform.instance import get_cluster_info
+from sky.provision.shadeform.instance import open_ports
+from sky.provision.shadeform.instance import query_instances
+from sky.provision.shadeform.instance import run_instances
+from sky.provision.shadeform.instance import stop_instances
+from sky.provision.shadeform.instance import terminate_instances
+from sky.provision.shadeform.instance import wait_instances
@@ -0,0 +1,12 @@
+"""Shadeform configuration bootstrapping."""
+
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name  # unused
+
+    return config
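Shadeform's bootstrap step is a pass-through: unlike clouds that create security groups or IAM roles during bootstrapping, it returns the ProvisionConfig unchanged. An illustration of that contract, where DummyConfig and the argument values are hypothetical stand-ins because the ProvisionConfig constructor is not shown in this diff:

from sky.provision.shadeform.config import bootstrap_instances

class DummyConfig:  # hypothetical stand-in for common.ProvisionConfig
    pass

cfg = DummyConfig()
out = bootstrap_instances(region='any-region', cluster_name='demo', config=cfg)
assert out is cfg  # bootstrapping leaves the config untouched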