skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,11 @@
1
- """Utility functions for deploying Kubernetes clusters."""
1
+ """Utility functions for deploying local Kubernetes kind clusters."""
2
2
  import os
3
+ import random
3
4
  import shlex
4
5
  import subprocess
5
- import sys
6
6
  import tempfile
7
- from typing import List, Optional
8
-
9
- import colorama
7
+ import textwrap
8
+ from typing import Optional, Tuple
10
9
 
11
10
  from sky import check as sky_check
12
11
  from sky import sky_logging
@@ -24,279 +23,154 @@ logger = sky_logging.init_logger(__name__)
24
23
 
25
24
  # Default path for Kubernetes configuration file
26
25
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
26
+ DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
27
+ LOCAL_CLUSTER_PORT_RANGE = 100
28
+ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
29
+ LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
27
30
 
28
31
 
29
- def check_ssh_cluster_dependencies(
30
- raise_error: bool = True) -> Optional[List[str]]:
31
- """Checks if the dependencies for ssh cluster are installed.
32
-
33
- Args:
34
- raise_error: set to true when the dependency needs to be present.
35
- set to false for `sky check`, where reason strings are compiled
36
- at the end.
37
-
38
- Returns: the reasons list if there are missing dependencies.
39
- """
40
- # error message
41
- jq_message = ('`jq` is required to setup ssh cluster.')
42
-
43
- # save
44
- reasons = []
45
- required_binaries = []
46
-
47
- # Ensure jq is installed
48
- try:
49
- subprocess.run(['jq', '--version'],
50
- stdout=subprocess.DEVNULL,
51
- stderr=subprocess.DEVNULL,
52
- check=True)
53
- except (FileNotFoundError, subprocess.CalledProcessError):
54
- required_binaries.append('jq')
55
- reasons.append(jq_message)
56
-
57
- if required_binaries:
58
- reasons.extend([
59
- 'On Debian/Ubuntu, install the missing dependenc(ies) with:',
60
- f' $ sudo apt install {" ".join(required_binaries)}',
61
- 'On MacOS, install with: ',
62
- f' $ brew install {" ".join(required_binaries)}',
63
- ])
64
- if raise_error:
65
- with ux_utils.print_exception_no_traceback():
66
- raise RuntimeError('\n'.join(reasons))
67
- return reasons
68
- return None
69
-
70
-
71
- def deploy_ssh_cluster(cleanup: bool = False,
72
- infra: Optional[str] = None,
73
- kubeconfig_path: Optional[str] = None):
74
- """Deploy a Kubernetes cluster on SSH targets.
32
+ def generate_kind_config(port_start: int,
33
+ num_nodes: int = 1,
34
+ gpus: bool = False) -> str:
35
+ """Generate a kind cluster config with ports mapped from host to container
75
36
 
76
- This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
77
- Kubernetes cluster on the specified machines.
37
+ Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
38
+ Internally, this will map to ports 30000 - 30099
78
39
 
79
40
  Args:
80
- cleanup: Whether to clean up the cluster instead of deploying.
81
- infra: Name of the cluster in ssh_node_pools.yaml to use.
82
- If None, the first cluster in the file will be used.
83
- kubeconfig_path: Path to save the Kubernetes configuration file.
84
- If None, the default ~/.kube/config will be used.
85
- """
86
- check_ssh_cluster_dependencies()
87
-
88
- # Prepare command to call deploy_remote_cluster.py script
89
- # TODO(romilb): We should move this to a native python method/class call
90
- # instead of invoking a script with subprocess.
91
- path_to_package = os.path.dirname(__file__)
92
- up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
93
- cwd = os.path.dirname(os.path.abspath(up_script_path))
94
-
95
- deploy_command = [sys.executable, up_script_path]
41
+ path: Path to generate the config file at
42
+ port_start: Port range start for mappings
43
+ num_nodes: Number of nodes in the cluster
44
+ gpus: If true, initialize kind cluster with GPU support
96
45
 
97
- if cleanup:
98
- deploy_command.append('--cleanup')
99
-
100
- if infra:
101
- deploy_command.extend(['--infra', infra])
102
-
103
- # Use the default kubeconfig path if none is provided
104
- kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
105
- deploy_command.extend(['--kubeconfig-path', kubeconfig_path])
106
-
107
- # Setup logging paths
108
- run_timestamp = sky_logging.get_run_timestamp()
109
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
110
- 'ssh_up.log')
111
-
112
- if cleanup:
113
- msg_str = 'Cleaning up SSH Node Pools...'
114
- else:
115
- msg_str = 'Initializing deployment to SSH Node Pools...'
116
-
117
- # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
118
- env = os.environ.copy()
119
- env['PYTHONUNBUFFERED'] = '1'
120
-
121
- with rich_utils.safe_status(
122
- ux_utils.spinner_message(msg_str, log_path=log_path,
123
- is_local=True)):
124
- returncode, _, stderr = log_lib.run_with_log(
125
- cmd=deploy_command,
126
- log_path=log_path,
127
- require_outputs=True,
128
- stream_logs=False,
129
- line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
130
- is_local=False),
131
- cwd=cwd,
132
- env=env)
46
+ Returns:
47
+ The kind cluster config
48
+ """
49
+ internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
50
+ internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
51
+
52
+ config = textwrap.dedent(f"""
53
+ apiVersion: kind.x-k8s.io/v1alpha4
54
+ kind: Cluster
55
+ kubeadmConfigPatches:
56
+ - |
57
+ kind: ClusterConfiguration
58
+ apiServer:
59
+ extraArgs:
60
+ "service-node-port-range": {internal_start}-{internal_end}
61
+ nodes:
62
+ - role: control-plane
63
+ kubeadmConfigPatches:
64
+ - |
65
+ kind: InitConfiguration
66
+ nodeRegistration:
67
+ kubeletExtraArgs:
68
+ node-labels: "ingress-ready=true"
69
+ """)
70
+ if gpus:
71
+ config += textwrap.indent(
72
+ textwrap.dedent("""
73
+ extraMounts:
74
+ - hostPath: /dev/null
75
+ containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
76
+ config += textwrap.indent(textwrap.dedent("""
77
+ extraPortMappings:"""), ' ' * 2)
78
+ for offset in range(LOCAL_CLUSTER_PORT_RANGE):
79
+ config += textwrap.indent(
80
+ textwrap.dedent(f"""
81
+ - containerPort: {internal_start + offset}
82
+ hostPort: {port_start + offset}
83
+ listenAddress: "0.0.0.0"
84
+ protocol: tcp
85
+ """), ' ' * 2)
86
+ if num_nodes > 1:
87
+ config += '- role: worker\n' * (num_nodes - 1)
88
+ return config
89
+
90
+
91
+ def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
92
+ is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
93
+ if port_start is None:
94
+ if is_default:
95
+ port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
96
+ else:
97
+ port_start = random.randint(301, 399) * 100
98
+ port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
99
+
100
+ port_range = f'Current port range: {port_start}-{port_end}'
101
+ if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
102
+ raise ValueError('Default local cluster `skypilot` should have '
103
+ f'port range from 30000 to 30099. {port_range}.')
104
+ if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
105
+ raise ValueError('Port range 30000 to 30099 is reserved for '
106
+ f'default local cluster `skypilot`. {port_range}.')
107
+ if port_start % 100 != 0:
108
+ raise ValueError('Local cluster port start must be a multiple of 100. '
109
+ f'{port_range}.')
110
+
111
+ return port_start, port_end
112
+
113
+
114
+ def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
115
+ gpus: bool):
116
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
117
+ port_start, port_end = _get_port_range(name, port_start)
118
+ context_name = f'kind-{name}'
119
+ cluster_created = False
133
120
 
134
- if returncode == 0:
135
- success = True
136
- else:
137
- with ux_utils.print_exception_no_traceback():
138
- log_hint = ux_utils.log_path_hint(log_path, is_local=False)
139
- raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
140
- f'{log_hint}'
141
- f'\nError: {stderr}')
121
+ # Check if GPUs are available on the host
122
+ local_gpus_available = backend_utils.check_local_gpus()
123
+ gpus = gpus and local_gpus_available
142
124
 
143
- if success:
144
- # Add an empty line to separate the deployment logs from the final
145
- # message
146
- logger.info('')
147
- if cleanup:
148
- logger.info(
149
- ux_utils.finishing_message(
150
- '🎉 SSH Node Pools cleaned up successfully.',
151
- log_path=log_path,
152
- is_local=True))
153
- else:
125
+ # Check if ~/.kube/config exists:
126
+ if os.path.exists(os.path.expanduser('~/.kube/config')):
127
+ curr_context = kubernetes_utils.get_current_kube_config_context_name()
128
+ if curr_context is not None and curr_context != context_name:
154
129
  logger.info(
155
- ux_utils.finishing_message(
156
- '🎉 SSH Node Pools set up successfully. ',
157
- follow_up_message=(
158
- f'Run `{colorama.Style.BRIGHT}'
159
- f'sky check ssh'
160
- f'{colorama.Style.RESET_ALL}` to verify access, '
161
- f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
162
- f'{colorama.Style.RESET_ALL}` to launch a cluster. '),
163
- log_path=log_path,
164
- is_local=True))
165
-
166
-
167
- def deploy_remote_cluster(ip_list: List[str],
168
- ssh_user: str,
169
- ssh_key: str,
170
- cleanup: bool,
171
- context_name: Optional[str] = None,
172
- password: Optional[str] = None):
173
- success = False
174
- path_to_package = os.path.dirname(__file__)
175
- up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
176
- # Get directory of script and run it from there
177
- cwd = os.path.dirname(os.path.abspath(up_script_path))
178
-
179
- # Create temporary files for the IPs and SSH key
180
- with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
181
- tempfile.NamedTemporaryFile(mode='w') as key_file:
182
-
183
- # Write IPs and SSH key to temporary files
184
- ip_file.write('\n'.join(ip_list))
185
- ip_file.flush()
186
-
187
- key_file.write(ssh_key)
188
- key_file.flush()
189
- os.chmod(key_file.name, 0o600)
190
-
191
- # Use the legacy mode command line arguments for backward compatibility
192
- deploy_command = [
193
- sys.executable, up_script_path, '--ips-file', ip_file.name,
194
- '--user', ssh_user, '--ssh-key', key_file.name
195
- ]
196
-
197
- if context_name is not None:
198
- deploy_command.extend(['--context-name', context_name])
199
- if password is not None:
200
- deploy_command.extend(['--password', password])
201
- if cleanup:
202
- deploy_command.append('--cleanup')
130
+ f'Current context in kube config: {curr_context}'
131
+ f'\nWill automatically switch to {context_name} after the '
132
+ 'local cluster is created.')
133
+ message_str = 'Creating local cluster {}{}...'
134
+ message_str = message_str.format(
135
+ name,
136
+ ' with GPU support (this may take up to 15 minutes)' if gpus else '')
137
+
138
+ with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
139
+ delete=True) as f:
140
+ # Choose random port range to use on the host machine.
141
+ # Port range is port_start - port_start + 99 (exactly 100 ports).
142
+ logger.debug(f'Using host port range {port_start}-{port_end}')
143
+ f.write(generate_kind_config(port_start, gpus=gpus))
144
+ f.flush()
145
+
146
+ path_to_package = os.path.dirname(__file__)
147
+ up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
148
+
149
+ # Get directory of script and run it from there
150
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
151
+ run_command = f'{up_script_path} {name} {f.name}'
152
+ if gpus:
153
+ run_command += ' --gpus'
154
+ run_command = shlex.split(run_command)
203
155
 
204
156
  # Setup logging paths
205
157
  run_timestamp = sky_logging.get_run_timestamp()
206
158
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
207
159
  'local_up.log')
208
-
209
- if cleanup:
210
- msg_str = 'Cleaning up remote cluster...'
211
- else:
212
- msg_str = 'Deploying remote cluster...'
213
-
214
- # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
215
- env = os.environ.copy()
216
- env['PYTHONUNBUFFERED'] = '1'
160
+ logger.info(message_str)
217
161
 
218
162
  with rich_utils.safe_status(
219
- ux_utils.spinner_message(msg_str,
163
+ ux_utils.spinner_message(message_str,
220
164
  log_path=log_path,
221
165
  is_local=True)):
222
166
  returncode, _, stderr = log_lib.run_with_log(
223
- cmd=deploy_command,
167
+ cmd=run_command,
224
168
  log_path=log_path,
225
169
  require_outputs=True,
226
170
  stream_logs=False,
227
- line_processor=log_utils.SkyRemoteUpLineProcessor(
171
+ line_processor=log_utils.SkyLocalUpLineProcessor(
228
172
  log_path=log_path, is_local=True),
229
- cwd=cwd,
230
- env=env)
231
- if returncode == 0:
232
- success = True
233
- else:
234
- with ux_utils.print_exception_no_traceback():
235
- log_hint = ux_utils.log_path_hint(log_path, is_local=True)
236
- raise RuntimeError('Failed to deploy remote cluster. '
237
- f'Full log: {log_hint}'
238
- f'\nError: {stderr}')
239
-
240
- if success:
241
- if cleanup:
242
- logger.info(
243
- ux_utils.finishing_message(
244
- '🎉 Remote cluster cleaned up successfully.',
245
- log_path=log_path,
246
- is_local=True))
247
- else:
248
- logger.info(
249
- ux_utils.finishing_message(
250
- '🎉 Remote cluster deployed successfully.',
251
- log_path=log_path,
252
- is_local=True))
253
-
254
-
255
- def deploy_local_cluster(gpus: bool):
256
- cluster_created = False
257
-
258
- # Check if GPUs are available on the host
259
- local_gpus_available = backend_utils.check_local_gpus()
260
- gpus = gpus and local_gpus_available
261
-
262
- # Check if ~/.kube/config exists:
263
- if os.path.exists(os.path.expanduser('~/.kube/config')):
264
- curr_context = kubernetes_utils.get_current_kube_config_context_name()
265
- skypilot_context = 'kind-skypilot'
266
- if curr_context is not None and curr_context != skypilot_context:
267
- logger.info(
268
- f'Current context in kube config: {curr_context}'
269
- '\nWill automatically switch to kind-skypilot after the local '
270
- 'cluster is created.')
271
- message_str = 'Creating local cluster{}...'
272
- message_str = message_str.format((' with GPU support (this may take up '
273
- 'to 15 minutes)') if gpus else '')
274
- path_to_package = os.path.dirname(__file__)
275
- up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
276
-
277
- # Get directory of script and run it from there
278
- cwd = os.path.dirname(os.path.abspath(up_script_path))
279
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
280
- run_command = shlex.split(run_command)
281
-
282
- # Setup logging paths
283
- run_timestamp = sky_logging.get_run_timestamp()
284
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
285
- 'local_up.log')
286
- logger.info(message_str)
287
-
288
- with rich_utils.safe_status(
289
- ux_utils.spinner_message(message_str,
290
- log_path=log_path,
291
- is_local=True)):
292
- returncode, _, stderr = log_lib.run_with_log(
293
- cmd=run_command,
294
- log_path=log_path,
295
- require_outputs=True,
296
- stream_logs=False,
297
- line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
298
- is_local=True),
299
- cwd=cwd)
173
+ cwd=cwd)
300
174
 
301
175
  # Kind always writes to stderr even if it succeeds.
302
176
  # If the failure happens after the cluster is created, we need
@@ -309,11 +183,11 @@ def deploy_local_cluster(gpus: bool):
309
183
  elif returncode == 100:
310
184
  logger.info(
311
185
  ux_utils.finishing_message(
312
- 'Local cluster already exists.\n',
186
+ f'Local cluster {name} already exists.\n',
313
187
  log_path=log_path,
314
188
  is_local=True,
315
189
  follow_up_message=
316
- 'If you want to delete it instead, run: sky local down'))
190
+ 'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
317
191
  else:
318
192
  with ux_utils.print_exception_no_traceback():
319
193
  log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -339,7 +213,7 @@ def deploy_local_cluster(gpus: bool):
339
213
  if gpus:
340
214
  # Get GPU model by querying the node labels
341
215
  label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
342
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
216
+ gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
343
217
  try:
344
218
  # Run the command and capture the output
345
219
  gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -375,8 +249,10 @@ def deploy_local_cluster(gpus: bool):
375
249
  'This may cause issues with running tasks.')
376
250
  logger.info(
377
251
  ux_utils.finishing_message(
378
- message=(f'Local Kubernetes cluster created successfully with '
379
- f'{num_cpus} CPUs{gpu_message}.'),
252
+ message=(
253
+ f'Local Kubernetes cluster {name} created successfully '
254
+ f'with {num_cpus} CPUs{gpu_message} on host port range '
255
+ f'{port_start}-{port_end}.'),
380
256
  log_path=log_path,
381
257
  is_local=True,
382
258
  follow_up_message=(
@@ -384,3 +260,54 @@ def deploy_local_cluster(gpus: bool):
384
260
  'Hint: To change the number of CPUs, change your docker '
385
261
  'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
386
262
  f'{gpu_hint}')))
263
+
264
+
265
def teardown_local_cluster(name: Optional[str] = None):
    """Delete a local kind cluster and re-run the Kubernetes check.

    Runs the ``delete_cluster.sh`` script located next to this module and
    writes its output to ``local_down.log`` under a new run-timestamp
    directory inside ``constants.SKY_LOGS_DIRECTORY``.

    Args:
        name: Name of the local cluster to remove. Falls back to
            ``DEFAULT_LOCAL_CLUSTER_NAME`` when ``None`` or empty.

    Raises:
        RuntimeError: If the script exits with a code other than 0
            (removed) or 100 (reported here as "does not exist").
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    # Get directory of script and run it from there
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Build the argv list directly instead of formatting a string and
    # re-splitting it: a script path or cluster name containing whitespace
    # or quote characters would otherwise be torn into the wrong arguments.
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this informational line so it does not show up in the error
        # text of the RuntimeError below.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Run sky check
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))
@@ -48,8 +48,16 @@ fi
48
48
 
49
49
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
    # If context is none, it means we are using incluster auth. In this case,
    # we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
    kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --kubeconfig=/dev/null --"
else
    kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --context=\"$context\" --"
fi

# Execute command on remote pod, waiting for rsync to be available first.
# The waiting happens on the remote pod, not locally, which is more efficient
# and reliable than polling from the local machine.
# We wrap the command in a bash script that waits for rsync, then execs the original command.
# Timeout after MAX_WAIT_TIME_SECONDS seconds.
MAX_WAIT_TIME_SECONDS=300
# The remote loop sleeps 0.5s per iteration, so the iteration limit is twice
# the timeout in seconds. It is computed here with arithmetic expansion so the
# remote script receives a plain integer: passing the unevaluated text
# "300*2" makes `[ ... -ge ... ]` fail with "integer expression expected" and
# the timeout never triggers.
# "${kubectl_cmd_base% --}" strips the trailing " --" so that -i can be
# inserted before the command separator.
# NOTE(review): the command line is assembled as a string and run through
# eval; resource names, namespaces or contexts containing quote characters
# would not survive this. Left unchanged here.
eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=$((MAX_WAIT_TIME_SECONDS*2)); until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""