skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly has been flagged as possibly problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py

@@ -5,7 +5,7 @@ import json
 import re
 import sys
 import time
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 from sky import exceptions
 from sky import global_user_state
@@ -33,6 +33,9 @@ from sky.utils.db import db_utils
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
 _MAX_RETRIES = 3
+_MAX_MISSING_PODS_RETRIES = 5
+_MAX_QUERY_INSTANCES_RETRIES = 5
+_QUERY_INSTANCES_RETRY_INTERVAL = .5
 _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

 # Pattern to extract SSH user from command output, handling MOTD contamination
@@ -81,7 +84,7 @@ def is_high_availability_cluster_by_kubectl(
             context).list_namespaced_deployment(
                 namespace,
                 label_selector=
-                f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+                f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
     except kubernetes.api_exception():
         return False
     # It is a high availability cluster if there is at least one deployment
@@ -425,11 +428,11 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
         # Get all pods in a single API call using the cluster name label
         # which all pods in new_nodes should share
         cluster_name_on_cloud = new_nodes[0].metadata.labels[
-            k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
+            constants.TAG_SKYPILOT_CLUSTER_NAME]
         pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+            f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
         ).items

         # Get the set of found pod names and check if we have all expected pods
@@ -489,17 +492,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,


 @timeline.event
-def _wait_for_pods_to_run(namespace, context, new_nodes):
+def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
     """Wait for pods and their containers to be ready.

     Pods may be pulling images or may be in the process of container
     creation.
     """
-    if not new_nodes:
+    if not new_pods:
         return

     # Create a set of pod names we're waiting for
-    expected_pod_names = {node.metadata.name for node in new_nodes}
+    expected_pod_names = {pod.metadata.name for pod in new_pods}

     def _check_init_containers(pod):
         # Check if any of the init containers failed
@@ -526,28 +529,62 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
                     'Failed to create init container for pod '
                     f'{pod.metadata.name}. Error details: {msg}.')

+    missing_pods_retry = 0
     while True:
         # Get all pods in a single API call
-        cluster_name = new_nodes[0].metadata.labels[
-            k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
+        cluster_name_on_cloud = new_pods[0].metadata.labels[
+            constants.TAG_SKYPILOT_CLUSTER_NAME]
         all_pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+            f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in all_pods}
-        missing_pods = expected_pod_names - found_pod_names
-        if missing_pods:
+        missing_pod_names = expected_pod_names - found_pod_names
+        if missing_pod_names:
+            # In _wait_for_pods_to_schedule, we already wait for all pods to go
+            # from pending to scheduled. So if a pod is missing here, it means
+            # something unusual must have happened, and so should be treated as
+            # an exception.
+            # It is also only in _wait_for_pods_to_schedule that
+            # provision_timeout is used.
+            # TODO(kevin): Should we take provision_timeout into account here,
+            # instead of hardcoding the number of retries?
+            if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+                for pod_name in missing_pod_names:
+                    reason = _get_pod_missing_reason(context, namespace,
+                                                     cluster_name, pod_name)
+                    logger.warning(f'Pod {pod_name} missing: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Failed to get all pods after {missing_pods_retry} '
+                    f'retries. Some pods may have been terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
             logger.info('Retrying running pods check: '
-                        f'Missing pods: {missing_pods}')
+                        f'Missing pods: {missing_pod_names}')
             time.sleep(0.5)
+            missing_pods_retry += 1
             continue

         all_pods_running = True
         for pod in all_pods:
             if pod.metadata.name not in expected_pod_names:
                 continue
+
+            # Check if pod is terminated/preempted/failed.
+            if (pod.metadata.deletion_timestamp is not None or
+                    pod.status.phase == 'Failed'):
+                # Get the reason and write to cluster events before
+                # the pod gets completely deleted from the API.
+                reason = _get_pod_termination_reason(pod, cluster_name)
+                logger.warning(f'Pod {pod.metadata.name} terminated: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Pod {pod.metadata.name} has terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
+
             # Continue if pod and all the containers within the
             # pod are successfully created and running.
             if pod.status.phase == 'Running' and all(
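
Note: the missing-pods handling in the hunk above polls a bounded number of times before giving up and raising. A minimal, standalone sketch of that bounded-retry pattern (not SkyPilot code; the helper name and constants below are illustrative only):

    import time

    MAX_RETRIES = 5        # analogous to _MAX_MISSING_PODS_RETRIES above
    RETRY_INTERVAL = 0.5   # seconds between polls, matching time.sleep(0.5)

    def wait_until_all_present(expected_names, fetch_names):
        """Poll fetch_names() until every expected name appears.

        Raises RuntimeError once the retry budget is exhausted, mirroring the
        retry-then-raise behavior in the hunk above.
        """
        retries = 0
        while True:
            missing = set(expected_names) - set(fetch_names())
            if not missing:
                return
            if retries >= MAX_RETRIES:
                raise RuntimeError(
                    f'Still missing after {retries} retries: {sorted(missing)}')
            time.sleep(RETRY_INTERVAL)
            retries += 1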
@@ -583,31 +620,6 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
         time.sleep(1)


-def _run_function_with_retries(func: Callable,
-                               operation_name: str,
-                               max_retries: int = _MAX_RETRIES,
-                               retry_delay: int = 5) -> Any:
-    """Runs a function with retries on Kubernetes errors.
-    Args:
-        func: Function to retry
-        operation_name: Name of the operation for logging
-        max_retries: Maximum number of retry attempts
-        retry_delay: Delay between retries in seconds
-    Raises:
-        The last exception encountered if all retries fail.
-    """
-    for attempt in range(max_retries + 1):
-        try:
-            return func()
-        except config_lib.KubernetesError:
-            if attempt < max_retries:
-                logger.warning(f'Failed to {operation_name} - '
-                               f'retrying in {retry_delay} seconds.')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
 @timeline.event
 def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
     """Pre-initialization step for SkyPilot pods.
@@ -902,7 +914,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     else:
         pod_spec['metadata']['labels'] = tags
     pod_spec['metadata']['labels'].update(
-        {k8s_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+        {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})

     terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
                                                     ['Terminating'])
@@ -954,7 +966,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     nvidia_runtime_exists = False
     try:
         nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
-            context)
+            context=context)
     except kubernetes.kubernetes.client.ApiException as e:
         logger.warning('run_instances: Error occurred while checking for '
                        f'nvidia RuntimeClass - '
@@ -984,12 +996,19 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,

     def _create_resource_thread(i: int):
         pod_spec_copy = copy.deepcopy(pod_spec)
-        if head_pod_name is None and i == 0:
-            # First pod should be head if no head exists
-            pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
-            head_selector = _head_service_selector(cluster_name_on_cloud)
-            pod_spec_copy['metadata']['labels'].update(head_selector)
-            pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+        # 0 is for head pod, while 1+ is for worker pods.
+        if i == 0:
+            if head_pod_name is None:
+                # First pod should be head if no head exists
+                pod_spec_copy['metadata']['labels'].update(
+                    constants.HEAD_NODE_TAGS)
+                head_selector = _head_service_selector(cluster_name_on_cloud)
+                pod_spec_copy['metadata']['labels'].update(head_selector)
+                pod_spec_copy['metadata'][
+                    'name'] = f'{cluster_name_on_cloud}-head'
+            else:
+                # If head pod already exists, we skip creating it.
+                return
         else:
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
@@ -1035,7 +1054,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                     'podAffinityTerm': {
                         'labelSelector': {
                             'matchExpressions': [{
-                                'key': k8s_constants.TAG_SKYPILOT_CLUSTER_NAME,
+                                'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
                                 'operator': 'In',
                                 'values': [cluster_name_on_cloud]
                             }]
@@ -1130,9 +1149,16 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
                        'and then up the cluster again.')
             raise exceptions.InconsistentHighAvailabilityError(message)

-    # Create pods in parallel
-    created_resources = subprocess_utils.run_in_parallel(
-        _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+    created_resources = []
+    if to_start_count > 0:
+        # Create pods in parallel.
+        # Use `config.count` instead of `to_start_count` to keep the index of
+        # the Pods consistent especially for the case where some Pods are down
+        # due to node failure or manual termination, etc. and then launch
+        # again to create the Pods back.
+        # The existing Pods will be skipped in _create_resource_thread.
+        created_resources = subprocess_utils.run_in_parallel(
+            _create_resource_thread, list(range(config.count)), _NUM_THREADS)

     if to_create_deployment:
         deployments = copy.deepcopy(created_resources)
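
Note: the comment above explains why parallel creation now iterates over config.count rather than to_start_count: pod names are derived from their index, so existing pods (including a surviving head) are skipped instead of renumbered. A rough standalone sketch of that indexing idea (hypothetical naming scheme, for illustration only; actual SkyPilot worker naming may differ):

    def pods_to_create(cluster: str, total_count: int, existing: set) -> list:
        """Return the pod names that still need to be created.

        Index 0 is the head pod; indices 1+ are workers. Iterating over the
        full range keeps a pod's name tied to its index, so re-launching after
        some pods were lost recreates only the missing ones.
        """
        names = []
        for i in range(total_count):
            # Hypothetical naming scheme for illustration only.
            name = f'{cluster}-head' if i == 0 else f'{cluster}-worker{i}'
            if name not in existing:
                names.append(name)
        return names

    # Example: head and worker1 survived, worker2 was lost.
    print(pods_to_create('mycluster', 3, {'mycluster-head', 'mycluster-worker1'}))
    # -> ['mycluster-worker2']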
@@ -1180,7 +1206,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
                  f'images): {[pod.metadata.name for pod in pods]}')
-    _wait_for_pods_to_run(namespace, context, pods)
+    _wait_for_pods_to_run(namespace, context, cluster_name, pods)
    logger.debug(f'run_instances: all pods are scheduled and running: '
                 f'{[pod.metadata.name for pod in pods]}')

@@ -1375,6 +1401,9 @@ def get_cluster_info(
                 external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
+                # TODO(hailong): `cluster.local` may need to be configurable
+                # Service name is same as the pod name for now.
+                internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
             )
         ]
         if _is_head(pod):
@@ -1413,6 +1442,13 @@ def get_cluster_info(
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+    # cpu_request may be a string like `100m`, need to parse and convert
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
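
Note: the num-cpus conversion added above relies on SkyPilot's own kubernetes_utils.parse_cpu_or_gpu_resource_to_float. As a rough, independent sketch of the same idea (millicore quantities like '100m' rounded down to whole cores, but never reported as 0 for Ray), assuming only plain and 'm'-suffixed quantities:

    def k8s_cpu_to_ray_num_cpus(cpu_request: str) -> str:
        """Convert a Kubernetes CPU quantity to a Ray-compatible integer string."""
        if cpu_request.endswith('m'):
            cpus = float(cpu_request[:-1]) / 1000  # millicores -> cores
        else:
            cpus = float(cpu_request)
        # Ray's num-cpus must be an integer; never report 0 for sub-core requests.
        return str(max(int(cpus), 1))

    assert k8s_cpu_to_ray_num_cpus('100m') == '1'
    assert k8s_cpu_to_ray_num_cpus('2500m') == '2'
    assert k8s_cpu_to_ray_num_cpus('4') == '4'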
@@ -1422,16 +1458,52 @@ def get_cluster_info(
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus': cpu_request,
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)


 def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-    """Get pod termination reason and write to cluster events."""
-    reasons = []
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
     latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
     if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
@@ -1446,18 +1518,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
                 if reason is None:
                     # just in-case reason is None, have default for debugging
                     reason = f'exit({exit_code})'
-                reasons.append(reason)
-                if terminated.finished_at > latest_timestamp:
-                    latest_timestamp = terminated.finished_at
+                container_reasons.append(reason)
+                latest_timestamp = max(latest_timestamp, terminated.finished_at)

    # TODO (kyuds): later, if needed, query `last_state` too.

-    if not reasons:
-        return ''
-
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-    pod_reason = ' | '.join(reasons)
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'

     global_user_state.add_cluster_event(
         cluster_name,
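
Note: with the two hunks above, the function now derives a human-readable summary from pod conditions before appending container exit reasons. A condensed, standalone sketch of that classification (plain dicts stand in for the kubernetes client's V1PodCondition objects; illustrative only):

    def summarize_termination(conditions):
        """Summarize why a pod went away from its status conditions.

        `conditions` is a list of dicts with 'type', 'reason' and 'message'
        keys. Mirrors the condition types checked above: 'Ready' records the
        last known readiness, 'TerminationTarget' marks a Kueue preemption,
        'DisruptionTarget' a generic disruption.
        """
        ready_state = 'Unknown'
        termination_reason = 'Terminated unexpectedly'
        for cond in conditions:
            reason = cond.get('reason') or 'Unknown reason'
            message = cond.get('message') or ''
            if cond['type'] == 'Ready':
                ready_state = f'{reason} ({message})' if message else reason
            elif cond['type'] == 'TerminationTarget':
                termination_reason = f'Preempted by Kueue: {reason}'
            elif cond['type'] == 'DisruptionTarget':
                termination_reason = f'Disrupted: {reason}'
        return f'{termination_reason}.\nLast known state: {ready_state}.'

    print(summarize_termination([
        {'type': 'Ready', 'reason': 'PodFailed', 'message': ''},
        {'type': 'DisruptionTarget', 'reason': 'EvictionByEvictionAPI'},
    ]))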
@@ -1602,35 +1671,50 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
     return failure_reason


-def query_instances(
-    cluster_name: str,
-    cluster_name_on_cloud: str,
-    provider_config: Optional[Dict[str, Any]] = None,
-    non_terminated_only: bool = True
-) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-    # Mapping from pod phase to skypilot status. These are the only valid pod
-    # phases.
-    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
-    status_map = {
-        'Pending': status_lib.ClusterStatus.INIT,
-        'Running': status_lib.ClusterStatus.UP,
-        'Failed': status_lib.ClusterStatus.INIT,
-        'Unknown': None,
-        'Succeeded': None,
-    }
-
-    assert provider_config is not None
-    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
-    context = kubernetes_utils.get_context_from_config(provider_config)
-    is_ssh = context.startswith('ssh-') if context else False
-    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
-
-    # Get all the pods with the label skypilot-cluster: <cluster_name>
+def list_namespaced_pod(context: Optional[str], namespace: str,
+                        cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                        label_selector: str) -> List[Any]:
+    # Get all the pods with the label skypilot-cluster-name: <cluster_name>
     try:
-        pods = kubernetes.core_api(context).list_namespaced_pod(
+        # log the query parameters we pass to the k8s api
+        logger.debug(f'Querying k8s api for pods:\n'
+                     f'context: {context}\n'
+                     f'namespace: {namespace}\n'
+                     f'label selector:`{label_selector}`.')
+
+        response = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
-            label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
-            _request_timeout=kubernetes.API_TIMEOUT).items
+            label_selector=label_selector,
+            _request_timeout=kubernetes.API_TIMEOUT)
+
+        # log PodList response info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`:\n'
+                         f'apiVersion={response.api_version}, '
+                         f'kind={response.kind},\n'
+                         f'metadata={response.metadata}')
+
+        pods = response.items
+
+        # log detailed Pod info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`: '
+                         f'len(pods)={len(pods)}')
+            for pod in pods:
+                logger.debug(f'k8s pod info for `{label_selector}`: '
+                             f'pod.apiVersion={pod.api_version}, '
+                             f'pod.kind={pod.kind}, \n'
+                             f'pod.name={pod.metadata.name}, '
+                             f'pod.namespace={pod.metadata.namespace}, \n'
+                             f'pod.labels={pod.metadata.labels}, \n'
+                             f'pod.annotations={pod.metadata.annotations}, \n'
+                             'pod.creationTimestamp='
+                             f'{pod.metadata.creation_timestamp}, '
+                             'pod.deletionTimestamp='
+                             f'{pod.metadata.deletion_timestamp}, \n'
+                             f'pod.status={pod.status}')
+        return pods
+
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
             if is_ssh:
@@ -1654,14 +1738,63 @@ def query_instances(
                     f'Failed to query {identity} {cluster_name_on_cloud!r} '
                     f'status: {common_utils.format_exception(e)}')

+
+def query_instances(
+    cluster_name: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+    status_map = {
+        'Pending': status_lib.ClusterStatus.INIT,
+        'Running': status_lib.ClusterStatus.UP,
+        'Failed': status_lib.ClusterStatus.INIT,
+        'Unknown': None,
+        'Succeeded': None,
+    }
+
+    assert provider_config is not None
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+    label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                      f'{cluster_name_on_cloud}')
+
+    attempts = 0
+    pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                               is_ssh, identity, label_selector)
+    # When we see no pods returned from the k8s api, we assume the pods have
+    # been terminated by the user directly and mark the cluster as terminated
+    # in the global user state.
+    # We add retry logic here as an attempt to mitigate a leak caused by the
+    # kubernetes api returning no pods despite the pods actually existing.
+    while (retry_if_missing and not pods and
+           attempts < _MAX_QUERY_INSTANCES_RETRIES):
+        logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                     f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times.'
+                     f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+        time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+        attempts += 1
+        pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                   is_ssh, identity, label_selector)
+        if len(pods) > 0:
+            logger.info(f'Found {len(pods)} pods for {label_selector} after'
+                        f'{attempts} retries.')
+
     # Check if the pods are running or pending
     cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                     Optional[str]]] = {}
     for pod in pods:
         phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
         pod_status = status_map[phase]
         reason = None
-        if phase in ('Failed', 'Unknown'):
+        if phase in ('Failed', 'Unknown') or is_terminating:
             reason = _get_pod_termination_reason(pod, cluster_name)
             logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
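
Note: the retry_if_missing path added above re-queries the API a few times before trusting an empty pod list, since a transiently empty response would otherwise cause the cluster to be treated as terminated. A minimal sketch of that guard (standalone, with illustrative names and constants):

    import time
    from typing import Callable, List

    MAX_RETRIES = 5        # analogous to _MAX_QUERY_INSTANCES_RETRIES
    RETRY_INTERVAL = 0.5   # analogous to _QUERY_INSTANCES_RETRY_INTERVAL

    def list_with_retry_if_empty(list_fn: Callable[[], List],
                                 retry_if_missing: bool = False) -> List:
        """Call list_fn(); if it returns nothing, retry a few times before
        accepting the empty result as authoritative."""
        items = list_fn()
        attempts = 0
        while retry_if_missing and not items and attempts < MAX_RETRIES:
            time.sleep(RETRY_INTERVAL)
            attempts += 1
            items = list_fn()
        return items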
sky/provision/kubernetes/network.py

@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional
 from sky import sky_logging
 from sky.adaptors import kubernetes
 from sky.provision import common
+from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import kubernetes_enums
@@ -48,12 +49,14 @@ def _open_ports_using_loadbalancer(
     service_name = _LOADBALANCER_SERVICE_NAME.format(
         cluster_name_on_cloud=cluster_name_on_cloud)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+
     content = network_utils.fill_loadbalancer_template(
-        namespace=provider_config.get('namespace', 'default'),
+        namespace=namespace,
         context=context,
         service_name=service_name,
         ports=ports,
-        selector_key='skypilot-cluster',
+        selector_key=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
         selector_value=cluster_name_on_cloud,
     )

@@ -103,11 +106,11 @@ def _open_ports_using_ingress(
     # To avoid this, we change ingress creation into one object containing
     # multiple rules.
     content = network_utils.fill_ingress_template(
-        namespace=provider_config.get('namespace', 'default'),
+        namespace=namespace,
         context=context,
         service_details=service_details,
         ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
-        selector_key='skypilot-cluster',
+        selector_key=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
         selector_value=cluster_name_on_cloud,
     )

@@ -165,9 +168,10 @@ def _cleanup_ports_for_loadbalancer(
     # TODO(aylei): test coverage
     context = provider_config.get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     network_utils.delete_namespaced_service(
         context=context,
-        namespace=provider_config.get('namespace', 'default'),
+        namespace=namespace,
         service_name=service_name,
     )

@@ -180,19 +184,19 @@ def _cleanup_ports_for_ingress(
     # Delete services for each port
     context = provider_config.get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     for port in ports:
         service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
         network_utils.delete_namespaced_service(
             context=context,
-            namespace=provider_config.get('namespace',
-                                          kubernetes_utils.DEFAULT_NAMESPACE),
+            namespace=namespace,
             service_name=service_name,
         )

     # Delete the single ingress used for all ports
     ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
     network_utils.delete_namespaced_ingress(
-        namespace=kubernetes_utils.get_namespace_from_config(provider_config),
+        namespace=namespace,
         context=kubernetes_utils.get_context_from_config(provider_config),
         ingress_name=ingress_name,
     )