skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/utils/dag_utils.py CHANGED
@@ -3,16 +3,16 @@ import copy
3
3
  from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import dag as dag_lib
6
- from sky import jobs
7
6
  from sky import sky_logging
8
7
  from sky import task as task_lib
9
- from sky.backends import backend_utils
8
+ from sky.utils import cluster_utils
10
9
  from sky.utils import common_utils
10
+ from sky.utils import registry
11
11
  from sky.utils import ux_utils
12
12
 
13
13
  logger = sky_logging.init_logger(__name__)
14
14
 
15
- # Message thrown when APIs sky.{exec,launch,spot.launch}() received a string
15
+ # Message thrown when APIs sky.{exec,launch,jobs.launch}() received a string
16
16
  # instead of a Dag. CLI (cli.py) is implemented by us so should not trigger
17
17
  # this.
18
18
  _ENTRYPOINT_STRING_AS_DAG_MESSAGE = """\
@@ -31,54 +31,43 @@ The command can then be run as:
31
31
 
32
32
  sky.launch(task, ...)
33
33
 
34
- sky.spot.launch(task, ...)
34
+ sky.jobs.launch(task, ...)
35
35
  """.strip()
36
36
 
37
37
 
38
38
  def convert_entrypoint_to_dag(entrypoint: Any) -> 'dag_lib.Dag':
39
- """Convert the entrypoint to a sky.Dag.
39
+ """Converts the entrypoint to a sky.Dag and applies the policy.
40
40
 
41
41
  Raises TypeError if 'entrypoint' is not a 'sky.Task' or 'sky.Dag'.
42
42
  """
43
43
  # Not suppressing stacktrace: when calling this via API user may want to
44
44
  # see their own program in the stacktrace. Our CLI impl would not trigger
45
45
  # these errors.
46
+ converted_dag: 'dag_lib.Dag'
46
47
  if isinstance(entrypoint, str):
47
48
  with ux_utils.print_exception_no_traceback():
48
49
  raise TypeError(_ENTRYPOINT_STRING_AS_DAG_MESSAGE)
49
50
  elif isinstance(entrypoint, dag_lib.Dag):
50
- return copy.deepcopy(entrypoint)
51
+ converted_dag = copy.deepcopy(entrypoint)
51
52
  elif isinstance(entrypoint, task_lib.Task):
52
53
  entrypoint = copy.deepcopy(entrypoint)
53
54
  with dag_lib.Dag() as dag:
54
55
  dag.add(entrypoint)
55
56
  dag.name = entrypoint.name
56
- return dag
57
+ converted_dag = dag
57
58
  else:
58
59
  with ux_utils.print_exception_no_traceback():
59
60
  raise TypeError(
60
61
  'Expected a sky.Task or sky.Dag but received argument of type: '
61
62
  f'{type(entrypoint)}')
62
63
 
64
+ return converted_dag
63
65
 
64
- def load_chain_dag_from_yaml(
65
- path: str,
66
- env_overrides: Optional[List[Tuple[str, str]]] = None,
67
- ) -> dag_lib.Dag:
68
- """Loads a chain DAG from a YAML file.
69
-
70
- Has special handling for an initial section in YAML that contains only the
71
- 'name' field, which is the DAG name.
72
66
 
73
- 'env_overrides' is a list of (key, value) pairs that will be used to update
74
- the task's 'envs' section. If it is a chain dag, the envs will be updated
75
- for all tasks in the chain.
76
-
77
- Returns:
78
- A chain Dag with 1 or more tasks (an empty entrypoint would create a
79
- trivial task).
80
- """
81
- configs = common_utils.read_yaml_all(path)
67
+ def _load_chain_dag(
68
+ configs: List[Dict[str, Any]],
69
+ env_overrides: Optional[List[Tuple[str, str]]] = None) -> dag_lib.Dag:
70
+ """Loads a chain DAG from a list of YAML configs."""
82
71
  dag_name = None
83
72
  if set(configs[0].keys()) == {'name'}:
84
73
  dag_name = configs[0]['name']
@@ -86,7 +75,7 @@ def load_chain_dag_from_yaml(
86
75
  elif len(configs) == 1:
87
76
  dag_name = configs[0].get('name')
88
77
 
89
- if len(configs) == 0:
78
+ if not configs:
90
79
  # YAML has only `name: xxx`. Still instantiate a task.
91
80
  configs = [{'name': dag_name}]
92
81
 
@@ -103,12 +92,74 @@ def load_chain_dag_from_yaml(
103
92
  return dag
104
93
 
105
94
 
106
- def dump_chain_dag_to_yaml(dag: dag_lib.Dag, path: str) -> None:
95
+ def load_chain_dag_from_yaml(
96
+ path: str,
97
+ env_overrides: Optional[List[Tuple[str, str]]] = None,
98
+ ) -> dag_lib.Dag:
99
+ """Loads a chain DAG from a YAML file.
100
+
101
+ Has special handling for an initial section in YAML that contains only the
102
+ 'name' field, which is the DAG name.
103
+
104
+ 'env_overrides' is a list of (key, value) pairs that will be used to update
105
+ the task's 'envs' section. If it is a chain dag, the envs will be updated
106
+ for all tasks in the chain.
107
+
108
+ Returns:
109
+ A chain Dag with 1 or more tasks (an empty entrypoint would create a
110
+ trivial task).
111
+ """
112
+ configs = common_utils.read_yaml_all(path)
113
+ return _load_chain_dag(configs, env_overrides)
114
+
115
+
116
+ def load_chain_dag_from_yaml_str(
117
+ yaml_str: str,
118
+ env_overrides: Optional[List[Tuple[str, str]]] = None,
119
+ ) -> dag_lib.Dag:
120
+ """Loads a chain DAG from a YAML string.
121
+
122
+ Has special handling for an initial section in YAML that contains only the
123
+ 'name' field, which is the DAG name.
124
+
125
+ 'env_overrides' is a list of (key, value) pairs that will be used to update
126
+ the task's 'envs' section. If it is a chain dag, the envs will be updated
127
+ for all tasks in the chain.
128
+
129
+ Returns:
130
+ A chain Dag with 1 or more tasks (an empty entrypoint would create a
131
+ trivial task).
132
+ """
133
+ configs = common_utils.read_yaml_all_str(yaml_str)
134
+ return _load_chain_dag(configs, env_overrides)
135
+
136
+
137
+ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
138
+ """Dumps a chain DAG to a YAML string.
139
+
140
+ Args:
141
+ dag: the DAG to dump.
142
+
143
+ Returns:
144
+ The YAML string.
145
+ """
107
146
  assert dag.is_chain(), dag
108
147
  configs = [{'name': dag.name}]
109
148
  for task in dag.tasks:
110
149
  configs.append(task.to_yaml_config())
111
- common_utils.dump_yaml(path, configs)
150
+ return common_utils.dump_yaml_str(configs)
151
+
152
+
153
+ def dump_chain_dag_to_yaml(dag: dag_lib.Dag, path: str) -> None:
154
+ """Dumps a chain DAG to a YAML file.
155
+
156
+ Args:
157
+ dag: the DAG to dump.
158
+ path: the path to the YAML file.
159
+ """
160
+ dag_str = dump_chain_dag_to_yaml_str(dag)
161
+ with open(path, 'w', encoding='utf-8') as f:
162
+ f.write(dag_str)
112
163
 
113
164
 
114
165
  def maybe_infer_and_fill_dag_and_task_names(dag: dag_lib.Dag) -> None:
@@ -125,7 +176,7 @@ def maybe_infer_and_fill_dag_and_task_names(dag: dag_lib.Dag) -> None:
125
176
  dag.name = first_task.name
126
177
 
127
178
  if dag.name is None:
128
- dag.name = backend_utils.generate_cluster_name()
179
+ dag.name = cluster_utils.generate_cluster_name()
129
180
 
130
181
  if len(dag.tasks) == 1:
131
182
  if first_task.name is None:
@@ -140,11 +191,21 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
140
191
  for task_ in dag.tasks:
141
192
 
142
193
  new_resources_list = []
194
+ default_strategy = registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default
195
+ assert default_strategy is not None
143
196
  for resources in list(task_.resources):
144
- change_default_value: Dict[str, Any] = {}
145
- if resources.job_recovery is None:
146
- change_default_value[
147
- 'job_recovery'] = jobs.DEFAULT_RECOVERY_STRATEGY
197
+ original_job_recovery = resources.job_recovery
198
+ job_recovery = {'strategy': default_strategy}
199
+ if isinstance(original_job_recovery, str):
200
+ job_recovery['strategy'] = original_job_recovery
201
+ elif isinstance(original_job_recovery, dict):
202
+ job_recovery.update(original_job_recovery)
203
+ strategy = job_recovery.get('strategy')
204
+ if strategy is None:
205
+ job_recovery['strategy'] = default_strategy
206
+ change_default_value: Dict[str, Any] = {
207
+ 'job_recovery': job_recovery
208
+ }
148
209
 
149
210
  new_resources = resources.copy(**change_default_value)
150
211
  new_resources_list.append(new_resources)
sky/utils/db_utils.py CHANGED
@@ -4,11 +4,27 @@ import sqlite3
4
4
  import threading
5
5
  from typing import Any, Callable, Optional
6
6
 
7
+ # This parameter (passed to sqlite3.connect) controls how long we will wait to
8
+ # obtains a database lock (not necessarily during connection, but whenever it is
9
+ # needed). It is not a connection timeout.
10
+ # Even in WAL mode, only a single writer is allowed at a time. Other writers
11
+ # will block until the write lock can be obtained. This behavior is described in
12
+ # the SQLite documentation for WAL: https://www.sqlite.org/wal.html
13
+ # Python's default timeout is 5s. In normal usage, lock contention is very low,
14
+ # and this is more than sufficient. However, in some highly concurrent cases,
15
+ # such as a jobs controller suddenly recovering thousands of jobs at once, we
16
+ # can see a small number of processes that take much longer to obtain the lock.
17
+ # In contrived highly contentious cases, around 0.1% of transactions will take
18
+ # >30s to take the lock. We have not seen cases that take >60s. For cases up to
19
+ # 1000x parallelism, this is thus thought to be a conservative setting.
20
+ # For more info, see the PR description for #4552.
21
+ _DB_TIMEOUT_S = 60
22
+
7
23
 
8
24
  @contextlib.contextmanager
9
25
  def safe_cursor(db_path: str):
10
26
  """A newly created, auto-committing, auto-closing cursor."""
11
- conn = sqlite3.connect(db_path)
27
+ conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
12
28
  cursor = conn.cursor()
13
29
  try:
14
30
  yield cursor
@@ -79,8 +95,6 @@ class SQLiteConn(threading.local):
79
95
  def __init__(self, db_path: str, create_table: Callable):
80
96
  super().__init__()
81
97
  self.db_path = db_path
82
- # NOTE: We use a timeout of 10 seconds to avoid database locked
83
- # errors. This is a hack, but it works.
84
- self.conn = sqlite3.connect(db_path, timeout=10)
98
+ self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
85
99
  self.cursor = self.conn.cursor()
86
100
  create_table(self.cursor, self.conn)
sky/utils/env_options.py CHANGED
@@ -1,21 +1,43 @@
1
1
  """Global environment options for sky."""
2
2
  import enum
3
3
  import os
4
+ from typing import Dict
4
5
 
5
6
 
6
7
  class Options(enum.Enum):
7
8
  """Environment variables for SkyPilot."""
8
- IS_DEVELOPER = 'SKYPILOT_DEV'
9
- SHOW_DEBUG_INFO = 'SKYPILOT_DEBUG'
10
- DISABLE_LOGGING = 'SKYPILOT_DISABLE_USAGE_COLLECTION'
11
- MINIMIZE_LOGGING = 'SKYPILOT_MINIMIZE_LOGGING'
9
+
10
+ # (env var name, default value)
11
+ IS_DEVELOPER = ('SKYPILOT_DEV', False)
12
+ SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False)
13
+ DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False)
14
+ MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True)
15
+ SUPPRESS_SENSITIVE_LOG = ('SKYPILOT_SUPPRESS_SENSITIVE_LOG', False)
12
16
  # Internal: this is used to skip the cloud user identity check, which is
13
17
  # used to protect cluster operations in a multi-identity scenario.
14
18
  # Currently, this is only used in the job and serve controller, as there
15
19
  # will not be multiple identities, and skipping the check can increase
16
20
  # robustness.
17
- SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK'
21
+ SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False)
22
+
23
+ def __init__(self, env_var: str, default: bool) -> None:
24
+ self.env_var = env_var
25
+ self.default = default
26
+
27
+ def __repr__(self) -> str:
28
+ return self.env_var
18
29
 
19
- def get(self):
30
+ def get(self) -> bool:
20
31
  """Check if an environment variable is set to True."""
21
- return os.getenv(self.value, 'False').lower() in ('true', '1')
32
+ return os.getenv(self.env_var,
33
+ str(self.default)).lower() in ('true', '1')
34
+
35
+ @property
36
+ def env_key(self) -> str:
37
+ """The environment variable key name."""
38
+ return self.value[0]
39
+
40
+ @classmethod
41
+ def all_options(cls) -> Dict[str, bool]:
42
+ """Returns all options as a dictionary."""
43
+ return {option.env_key: option.get() for option in list(Options)}
@@ -12,9 +12,11 @@ IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:l
12
12
  PORT_RANGE_START=30000
13
13
  PORT_RANGE_END=30100
14
14
 
15
+ USER_HASH=$1
16
+
15
17
  # Check for GPU flag
16
18
  ENABLE_GPUS=false
17
- if [[ "$1" == "--gpus" ]]; then
19
+ if [[ "$2" == "--gpus" ]]; then
18
20
  ENABLE_GPUS=true
19
21
  fi
20
22
 
@@ -88,45 +90,20 @@ if kind get clusters | grep -q skypilot; then
88
90
  fi
89
91
 
90
92
  # Generate cluster YAML
91
- echo "Generating /tmp/skypilot-kind.yaml"
93
+ YAML_PATH="/tmp/skypilot-kind-$USER_HASH.yaml"
94
+ echo "Generating $YAML_PATH"
92
95
 
93
96
  # Add GPUs flag to the generate_kind_config.py command if GPUs are enabled
94
97
  if $ENABLE_GPUS; then
95
- python -m sky.utils.kubernetes.generate_kind_config --path /tmp/skypilot-kind.yaml --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
98
+ python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
96
99
  else
97
- python -m sky.utils.kubernetes.generate_kind_config --path /tmp/skypilot-kind.yaml --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
100
+ python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
98
101
  fi
99
102
 
100
- kind create cluster --config /tmp/skypilot-kind.yaml --name skypilot
103
+ kind create cluster --config $YAML_PATH --name skypilot
101
104
 
102
105
  echo "Kind cluster created."
103
106
 
104
- # Function to wait for SkyPilot GPU labeling jobs to complete
105
- wait_for_gpu_labeling_jobs() {
106
- echo "Starting wait for SkyPilot GPU labeling jobs to complete..."
107
-
108
- SECONDS=0
109
- TIMEOUT=600 # 10 minutes in seconds
110
-
111
- while true; do
112
- TOTAL_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | wc -l)
113
- COMPLETED_JOBS=$(kubectl get jobs -n kube-system -l job=sky-gpu-labeler --no-headers | grep "1/1" | wc -l)
114
-
115
- if [[ $COMPLETED_JOBS -eq $TOTAL_JOBS ]]; then
116
- echo "All SkyPilot GPU labeling jobs completed ($TOTAL_JOBS)."
117
- break
118
- elif [ $SECONDS -ge $TIMEOUT ]; then
119
- echo "Timeout reached while waiting for GPU labeling jobs."
120
- exit 1
121
- else
122
- echo "Waiting for GPU labeling jobs to complete... ($COMPLETED_JOBS/$TOTAL_JOBS completed)"
123
- echo "To check status, see GPU labeling pods:"
124
- echo "kubectl get jobs -n kube-system -l job=sky-gpu-labeler"
125
- sleep 5
126
- fi
127
- done
128
- }
129
-
130
107
  # Function to wait for GPU operator to be correctly installed
131
108
  wait_for_gpu_operator_installation() {
132
109
  echo "Starting wait for GPU operator installation..."
@@ -150,22 +127,6 @@ wait_for_gpu_operator_installation() {
150
127
  done
151
128
  }
152
129
 
153
- wait_for_skypilot_gpu_image_pull() {
154
- echo "Pulling SkyPilot GPU image..."
155
- docker pull ${IMAGE_GPU}
156
- echo "Loading SkyPilot GPU image into kind cluster..."
157
- kind load docker-image --name skypilot ${IMAGE_GPU}
158
- echo "SkyPilot GPU image loaded into kind cluster."
159
- }
160
-
161
- wait_for_skypilot_cpu_image_pull() {
162
- echo "Pulling SkyPilot CPU image..."
163
- docker pull ${IMAGE}
164
- echo "Loading SkyPilot CPU image into kind cluster..."
165
- kind load docker-image --name skypilot ${IMAGE}
166
- echo "SkyPilot CPU image loaded into kind cluster."
167
- }
168
-
169
130
  wait_for_nginx_ingress_controller_install() {
170
131
  echo "Starting installation of Nginx Ingress Controller..."
171
132
 
@@ -206,21 +167,8 @@ if $ENABLE_GPUS; then
206
167
  nvidia/gpu-operator --set driver.enabled=false
207
168
  # Wait for GPU operator installation to succeed
208
169
  wait_for_gpu_operator_installation
209
-
210
- # Load the SkyPilot GPU image into the cluster for faster labelling
211
- wait_for_skypilot_gpu_image_pull
212
-
213
- # Label nodes with GPUs
214
- echo "Labelling nodes with GPUs..."
215
- python -m sky.utils.kubernetes.gpu_labeler
216
-
217
- # Wait for all the GPU labeling jobs to complete
218
- wait_for_gpu_labeling_jobs
219
170
  fi
220
171
 
221
- # Load local skypilot image on to the cluster for faster startup
222
- wait_for_skypilot_cpu_image_pull
223
-
224
172
  # Install the Nginx Ingress Controller
225
173
  wait_for_nginx_ingress_controller_install
226
174
 
@@ -0,0 +1,243 @@
1
#!/bin/bash
# Deploy a k3s Kubernetes cluster onto a set of existing machines over SSH,
# then point local kubectl and SkyPilot at it.
# Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [--cleanup]
# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
set -e

# Colors for nicer UX
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No color

# Variables
IPS_FILE=$1        # One IP per line; the first line is the head node.
USER=$2            # SSH login user. NOTE(review): shadows the shell's USER variable.
SSH_KEY=$3         # Private key used for every ssh/scp invocation.
K3S_TOKEN=mytoken # Any string can be used as the token
CLEANUP=false
INSTALL_GPU=false  # Flipped to true if any node reports a working GPU.

if [[ "$4" == "--cleanup" ]]; then
  CLEANUP=true
fi

# Basic argument checks
if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
  >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
  >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [--cleanup]"
  exit 1
fi

# Check if SSH key exists
if [ ! -f "$SSH_KEY" ]; then
  >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
  exit 1
fi

# Check if IPs file exists
if [ ! -f "$IPS_FILE" ]; then
  >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
  exit 1
fi

# Get head node and worker nodes from the IPs file
HEAD_NODE=$(head -n 1 "$IPS_FILE")
WORKER_NODES=$(tail -n +2 "$IPS_FILE")

# Check if the IPs file is empty or not formatted correctly
if [ -z "$HEAD_NODE" ]; then
  >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
  exit 1
fi
51
+
52
# Print an in-progress step, prefixed with a yellow arrow.
progress_message() {
  local msg="$1"
  echo -e "${YELLOW}➜ ${msg}${NC}"
}
56
+
57
# Print a completed step, prefixed with a green check mark.
success_message() {
  local msg="$1"
  echo -e "${GREEN}✔ ${msg}${NC}"
}
61
+
62
# Execute a command on a remote machine over SSH, using the script's key.
# $1 = node IP, $2 = command string; the SSH exit status is returned.
run_remote() {
  local node="$1"
  local remote_cmd="$2"
  ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$node" "$remote_cmd"
}
69
+
70
# Uninstall k3s (server role) from the head node and purge all cluster state.
# Best-effort: the `|| true` keeps going if k3s was never installed there.
cleanup_server_node() {
  local NODE_IP=$1
  echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
  run_remote "$NODE_IP" "
    echo 'Uninstalling k3s...' &&
    /usr/local/bin/k3s-uninstall.sh || true &&
    sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
  "
  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
}
81
+
82
# Uninstall k3s (agent role) from a worker node and purge all cluster state.
# Mirrors cleanup_server_node but uses the agent uninstall script.
cleanup_agent_node() {
  local NODE_IP=$1
  echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
  run_remote "$NODE_IP" "
    echo 'Uninstalling k3s...' &&
    /usr/local/bin/k3s-agent-uninstall.sh || true &&
    sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
  "
  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
}
93
+
94
# Probe a node for a usable NVIDIA GPU.
# Returns 0 when nvidia-smi exists on the node AND can list at least its GPU
# names; returns 1 otherwise.
check_gpu() {
  local node="$1"
  if run_remote "$node" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
    return 0 # GPU detected
  fi
  return 1 # No GPU detected
}
102
+
103
# Pre-flight checks
# With `set -e`, an unreachable head node aborts here before anything is changed.
run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible

# If --cleanup flag is set, uninstall k3s and exit
if [ "$CLEANUP" == "true" ]; then
  echo -e "${YELLOW}Starting cleanup...${NC}"

  # Clean up head node
  cleanup_server_node "$HEAD_NODE"

  # Clean up worker nodes
  for NODE in $WORKER_NODES; do
    cleanup_agent_node "$NODE"
  done

  echo -e "${GREEN}Cleanup completed successfully.${NC}"
  exit 0
fi
122
+
123
# Step 1: Install k3s (server) on the head node, copy the kubeconfig into the
# remote user's home, then poll up to 3 times for all nodes to become Ready.
#
# NOTE: the command string is in local double quotes, so remote-side variables
# must be escaped with a backslash (like \$(id -u) below). The original code
# used an unescaped `$i` in the post-loop check, which the *local* shell
# expanded (typically to nothing), breaking the check; it also misreported
# failure when the third `kubectl wait` attempt succeeded (i == 3 either way).
# Both are fixed by tracking success in an escaped \$ready flag.
progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
run_remote "$HEAD_NODE" "
  curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sh - &&
  mkdir -p ~/.kube &&
  sudo cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
  sudo chown \$(id -u):\$(id -g) ~/.kube/config &&
  ready=false
  for i in 1 2 3; do
    if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
      ready=true
      break
    else
      echo 'Waiting for nodes to be ready...'
      sleep 5
    fi
  done
  if [ \"\$ready\" != \"true\" ]; then
    echo 'Failed to wait for nodes to be ready after 3 attempts'
    exit 1
  fi"
success_message "K3s deployed on head node."
143
+
144
# Check if head node has a GPU
if check_gpu "$HEAD_NODE"; then
  echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
  INSTALL_GPU=true
fi

# Fetch the head node's internal IP (this will be passed to worker nodes)
# NOTE(review): `hostname -I` lists all addresses; this assumes the first one
# is the address workers can reach — confirm on multi-homed hosts.
# The \$1 escape makes awk's field reference expand remotely, not locally.
MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")

echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
154
+
155
# Step 2: Install k3s on worker nodes and join them to the master node
for NODE in $WORKER_NODES; do
  progress_message "Deploying Kubernetes on worker node ($NODE)..."
  # Setting K3S_URL makes the installer run in agent mode and join the
  # server at MASTER_ADDR using the shared token.
  run_remote "$NODE" "
    curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sh -"
  success_message "Kubernetes deployed on worker node ($NODE)."

  # Check if worker node has a GPU
  if check_gpu "$NODE"; then
    echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
    INSTALL_GPU=true
  fi
done
168
# Step 3: Configure local kubectl to connect to the cluster.
#
# BUG FIX: the original ran `scp` first and only then backed up
# $HOME/.kube/config — by that point the user's original kubeconfig had
# already been overwritten, so the ".bak" was a copy of the freshly fetched
# file. The backup (and a `mkdir -p`, so scp cannot fail on a missing
# ~/.kube directory) now happen BEFORE the fetch.
progress_message "Configuring local kubectl to connect to the cluster..."

KUBECONFIG_FILE="$HOME/.kube/config"
mkdir -p "$HOME/.kube"

# Back up the original kubeconfig file if it exists (must precede the scp).
if [[ -f "$KUBECONFIG_FILE" ]]; then
  echo "Backing up existing kubeconfig to $KUBECONFIG_FILE.bak"
  cp "$KUBECONFIG_FILE" "$KUBECONFIG_FILE.bak"
fi

# Fetch the cluster's kubeconfig from the head node.
scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$KUBECONFIG_FILE"

# Update kubeconfig for the local machine to use the master node's IP
# Temporary file to hold the modified kubeconfig
TEMP_FILE=$(mktemp)

# Remove the certificate-authority-data, and replace the server with the
# head node's address; TLS verification is skipped because the cert does not
# cover that address.
awk '
BEGIN { in_cluster = 0 }
/^clusters:/ { in_cluster = 1 }
/^users:/ { in_cluster = 0 }
in_cluster && /^ *certificate-authority-data:/ { next }
in_cluster && /^ *server:/ {
    print "    server: https://'${HEAD_NODE}:6443'"
    print "    insecure-skip-tls-verify: true"
    next
}
{ print }
' "$KUBECONFIG_FILE" > "$TEMP_FILE"

# Replace the original kubeconfig with the modified one
mv "$TEMP_FILE" "$KUBECONFIG_FILE"

success_message "kubectl configured to connect to the cluster."

echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
203
+
204
# Install GPU operator if a GPU was detected on any node.
# All of this runs on the head node: install helm, register the nvidia chart
# repo, and deploy gpu-operator with the containerd config/socket paths
# overridden to k3s's non-standard locations. The ldconfig symlink works
# around the toolkit expecting /sbin/ldconfig.real. driver install is left
# enabled (unlike the kind-based local script), since these are bare hosts.
if [ "$INSTALL_GPU" == "true" ]; then
  echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
  run_remote "$HEAD_NODE" "
    curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
    chmod 700 get_helm.sh &&
    ./get_helm.sh &&
    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
    kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
    sudo ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
    helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
      --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
      --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
      --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
      --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
      --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
      --set 'toolkit.env[2].value=nvidia' &&
    echo 'Waiting for GPU operator installation...' &&
    while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
      echo 'Waiting for GPU operator...'
      sleep 5
    done
    echo 'GPU operator installed successfully.'"
  success_message "GPU Operator installed."
else
  echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
fi
231
+
232
# Configure SkyPilot
# `sky check` verifies SkyPilot can reach the cluster via the kubeconfig
# written above; requires the skypilot CLI on the local machine.
progress_message "Configuring SkyPilot..."
sky check kubernetes
success_message "SkyPilot configured successfully."

# Display final success message
echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
echo "You can now interact with your Kubernetes cluster through SkyPilot: "
echo "  • List available GPUs: sky show-gpus --cloud kubernetes"
echo "  • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
echo "  • Connect to pod with SSH: ssh devbox"
echo "  • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"