skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,13 @@
1
1
  """Utility functions for deploying Kubernetes clusters."""
2
2
  import os
3
+ import random
3
4
  import shlex
4
5
  import subprocess
5
6
  import tempfile
6
- from typing import List, Optional
7
+ import textwrap
8
+ from typing import List, Optional, Tuple
9
+
10
+ import colorama
7
11
 
8
12
  from sky import check as sky_check
9
13
  from sky import sky_logging
@@ -16,92 +20,194 @@ from sky.utils import log_utils
16
20
  from sky.utils import rich_utils
17
21
  from sky.utils import subprocess_utils
18
22
  from sky.utils import ux_utils
23
+ from sky.utils.kubernetes import deploy_ssh_node_pools
19
24
 
20
25
  logger = sky_logging.init_logger(__name__)
21
26
 
27
# Default path for Kubernetes configuration file
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
# Name used for the local cluster when the caller does not supply one.
DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
# Number of consecutive ports mapped for each local cluster.
LOCAL_CLUSTER_PORT_RANGE = 100
# Container-side port window that host ports are mapped onto. The end is
# derived from the start and the range size so the three values cannot drift
# apart.
LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
LOCAL_CLUSTER_INTERNAL_PORT_END = (LOCAL_CLUSTER_INTERNAL_PORT_START +
                                   LOCAL_CLUSTER_PORT_RANGE - 1)
22
33
 
23
- def deploy_remote_cluster(ip_list: List[str],
24
- ssh_user: str,
25
- ssh_key: str,
26
- cleanup: bool,
27
- context_name: Optional[str] = None,
28
- password: Optional[str] = None):
29
- success = False
30
- path_to_package = os.path.dirname(__file__)
31
- up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
32
- # Get directory of script and run it from there
33
- cwd = os.path.dirname(os.path.abspath(up_script_path))
34
-
35
- # Create temporary files for the IPs and SSH key
36
- with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
37
- tempfile.NamedTemporaryFile(mode='w') as key_file:
38
-
39
- # Write IPs and SSH key to temporary files
40
- ip_file.write('\n'.join(ip_list))
41
- ip_file.flush()
42
-
43
- key_file.write(ssh_key)
44
- key_file.flush()
45
- os.chmod(key_file.name, 0o600)
46
-
47
- deploy_command = (f'{up_script_path} {ip_file.name} '
48
- f'{ssh_user} {key_file.name}')
49
- if context_name is not None:
50
- deploy_command += f' {context_name}'
51
- if password is not None:
52
- deploy_command += f' --password {password}'
53
- if cleanup:
54
- deploy_command += ' --cleanup'
55
-
56
- # Convert the command to a format suitable for subprocess
57
- deploy_command = shlex.split(deploy_command)
58
34
 
59
- # Setup logging paths
60
- run_timestamp = sky_logging.get_run_timestamp()
61
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
62
- 'local_up.log')
35
def check_ssh_cluster_dependencies(
        raise_error: bool = True) -> Optional[List[str]]:
    """Checks if the dependencies for ssh cluster are installed.

    The only dependency probed is `jq`, by running `jq --version`.

    Args:
        raise_error: set to true when the dependency needs to be present.
            set to false for `sky check`, where reason strings are compiled
            at the end.

    Returns:
        None if nothing is missing. Otherwise, when `raise_error` is False,
        the list of reason strings (one message per missing binary followed
        by install hints).

    Raises:
        RuntimeError: if a dependency is missing and `raise_error` is True.
    """
    jq_message = '`jq` is required to setup ssh cluster.'

    reasons: List[str] = []
    required_binaries: List[str] = []

    # Ensure jq is installed
    try:
        subprocess.run(['jq', '--version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError covers FileNotFoundError (jq not on PATH) and also cases
        # such as PermissionError (a non-executable `jq` on PATH), which
        # would otherwise escape as an unhandled exception instead of being
        # reported as a missing dependency.
        required_binaries.append('jq')
        reasons.append(jq_message)

    if not required_binaries:
        return None

    binaries = ' '.join(required_binaries)
    reasons.extend([
        'On Debian/Ubuntu, install the missing dependenc(ies) with:',
        f' $ sudo apt install {binaries}',
        'On MacOS, install with: ',
        f' $ brew install {binaries}',
    ])
    if raise_error:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError('\n'.join(reasons))
    return reasons
75
+
76
+
77
def deploy_ssh_cluster(cleanup: bool = False,
                       infra: Optional[str] = None,
                       kubeconfig_path: Optional[str] = None):
    """Deploy a Kubernetes cluster on SSH targets.

    This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
    Kubernetes cluster on the specified machines.

    Args:
        cleanup: Whether to clean up the cluster instead of deploying.
        infra: Name of the cluster in ssh_node_pools.yaml to use.
            If None, the first cluster in the file will be used.
        kubeconfig_path: Path to save the Kubernetes configuration file.
            If None, the default ~/.kube/config will be used.

    Raises:
        RuntimeError: if a required dependency is missing, or if
            `deploy_ssh_node_pools.deploy_clusters` raises; the original
            exception is chained as the cause.
    """
    check_ssh_cluster_dependencies()

    action = 'Cleanup' if cleanup else 'Deployment'
    msg_str = f'Initializing SSH Node Pools {action}...'
    # Report the operation that actually failed: a failed cleanup should not
    # be described as a failed deployment.
    if cleanup:
        failure_message = 'Failed to clean up SkyPilot on some Node Pools.'
    else:
        failure_message = 'Failed to deploy SkyPilot on some Node Pools.'

    with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
        try:
            deploy_ssh_node_pools.deploy_clusters(
                infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
        except Exception as e:  # pylint: disable=broad-except
            # Log the underlying error before re-raising without traceback.
            logger.error(str(e))
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(failure_message) from e

    logger.info('')
    if cleanup:
        logger.info(
            ux_utils.finishing_message(
                '🎉 SSH Node Pools cleaned up successfully.'))
    else:
        logger.info(
            ux_utils.finishing_message(
                '🎉 SSH Node Pools set up successfully. ',
                follow_up_message=(
                    f'Run `{colorama.Style.BRIGHT}'
                    f'sky check ssh'
                    f'{colorama.Style.RESET_ALL}` to verify access, '
                    f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
                    f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
122
+
123
+
124
+ def generate_kind_config(port_start: int,
125
+ num_nodes: int = 1,
126
+ gpus: bool = False) -> str:
127
+ """Generate a kind cluster config with ports mapped from host to container.
128
+
129
+ Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
130
+ Internally, this will map to ports 30000 - 30099
131
+
132
+ Args:
133
+ port_start: First host port of the mapped range
134
+ num_nodes: Total number of nodes in the cluster; each node beyond
135
+ the first is appended as a `role: worker` entry
136
+ gpus: If true, initialize kind cluster with GPU support
137
+
138
+ Returns:
139
+ The kind cluster config as a YAML string
140
+ """
141
+ internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
142
+ internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
143
+
144
+ config = textwrap.dedent(f"""
145
+ apiVersion: kind.x-k8s.io/v1alpha4
146
+ kind: Cluster
147
+ kubeadmConfigPatches:
148
+ - |
149
+ kind: ClusterConfiguration
150
+ apiServer:
151
+ extraArgs:
152
+ "service-node-port-range": {internal_start}-{internal_end}
153
+ nodes:
154
+ - role: control-plane
155
+ kubeadmConfigPatches:
156
+ - |
157
+ kind: InitConfiguration
158
+ nodeRegistration:
159
+ kubeletExtraArgs:
160
+ node-labels: "ingress-ready=true"
161
+ """)
162
+ if gpus:
163
+ config += textwrap.indent(
164
+ textwrap.dedent("""
165
+ extraMounts:
166
+ - hostPath: /dev/null
167
+ containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
168
+ config += textwrap.indent(textwrap.dedent("""
169
+ extraPortMappings:"""), ' ' * 2)
170
+ for offset in range(LOCAL_CLUSTER_PORT_RANGE):
171
+ config += textwrap.indent(
172
+ textwrap.dedent(f"""
173
+ - containerPort: {internal_start + offset}
174
+ hostPort: {port_start + offset}
175
+ listenAddress: "0.0.0.0"
176
+ protocol: tcp
177
+ """), ' ' * 2)
178
+ if num_nodes > 1:
179
+ config += '- role: worker\n' * (num_nodes - 1)
180
+ return config
181
+
182
+
183
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
    """Resolve and validate the host port range for a local cluster.

    Args:
        name: Local cluster name. DEFAULT_LOCAL_CLUSTER_NAME identifies the
            default cluster.
        port_start: First host port requested by the caller. If None, the
            default cluster gets LOCAL_CLUSTER_INTERNAL_PORT_START and any
            other cluster gets a random multiple of 100 between 30100 and
            39900.

    Returns:
        (port_start, port_end), where port_end is inclusive and the range
        spans LOCAL_CLUSTER_PORT_RANGE ports.

    Raises:
        ValueError: if the default cluster is not given its reserved range,
            if another cluster requests the reserved range, if port_start is
            not a multiple of 100, or if the range falls outside 1-65535.
    """
    is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
    if port_start is None:
        if is_default:
            port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
        else:
            port_start = random.randint(301, 399) * 100
    port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1

    port_range = f'Current port range: {port_start}-{port_end}'
    if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Default local cluster `skypilot` should have '
                         f'port range from 30000 to 30099. {port_range}.')
    if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Port range 30000 to 30099 is reserved for '
                         f'default local cluster `skypilot`. {port_range}.')
    if port_start % 100 != 0:
        raise ValueError('Local cluster port start must be a multiple of 100. '
                         f'{port_range}.')
    # Reject ranges that are not valid TCP ports up front (e.g. 0, negative
    # multiples of 100, or 65500 whose range would end at 65599) so the
    # failure is reported here rather than later during cluster creation.
    if port_start < 1 or port_end > 65535:
        raise ValueError('Local cluster host ports must be within 1-65535. '
                         f'{port_range}.')

    return port_start, port_end
88
204
 
89
- if success:
90
- if cleanup:
91
- logger.info(
92
- ux_utils.finishing_message(
93
- '🎉 Remote cluster cleaned up successfully.',
94
- log_path=log_path,
95
- is_local=True))
96
- else:
97
- logger.info(
98
- ux_utils.finishing_message(
99
- '🎉 Remote cluster deployed successfully.',
100
- log_path=log_path,
101
- is_local=True))
102
-
103
-
104
- def deploy_local_cluster(gpus: bool):
205
+
206
+ def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
207
+ gpus: bool):
208
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
209
+ port_start, port_end = _get_port_range(name, port_start)
210
+ context_name = f'kind-{name}'
105
211
  cluster_created = False
106
212
 
107
213
  # Check if GPUs are available on the host
@@ -111,41 +217,52 @@ def deploy_local_cluster(gpus: bool):
111
217
  # Check if ~/.kube/config exists:
112
218
  if os.path.exists(os.path.expanduser('~/.kube/config')):
113
219
  curr_context = kubernetes_utils.get_current_kube_config_context_name()
114
- skypilot_context = 'kind-skypilot'
115
- if curr_context is not None and curr_context != skypilot_context:
220
+ if curr_context is not None and curr_context != context_name:
116
221
  logger.info(
117
222
  f'Current context in kube config: {curr_context}'
118
- '\nWill automatically switch to kind-skypilot after the local '
119
- 'cluster is created.')
120
- message_str = 'Creating local cluster{}...'
121
- message_str = message_str.format((' with GPU support (this may take up '
122
- 'to 15 minutes)') if gpus else '')
123
- path_to_package = os.path.dirname(__file__)
124
- up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
223
+ f'\nWill automatically switch to {context_name} after the '
224
+ 'local cluster is created.')
225
+ message_str = 'Creating local cluster {}{}...'
226
+ message_str = message_str.format(
227
+ name,
228
+ ' with GPU support (this may take up to 15 minutes)' if gpus else '')
125
229
 
126
- # Get directory of script and run it from there
127
- cwd = os.path.dirname(os.path.abspath(up_script_path))
128
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
129
- run_command = shlex.split(run_command)
230
+ with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
231
+ delete=True) as f:
232
+ # Choose random port range to use on the host machine.
233
+ # Port range is port_start - port_start + 99 (exactly 100 ports).
234
+ logger.debug(f'Using host port range {port_start}-{port_end}')
235
+ f.write(generate_kind_config(port_start, gpus=gpus))
236
+ f.flush()
130
237
 
131
- # Setup logging paths
132
- run_timestamp = sky_logging.get_run_timestamp()
133
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
134
- 'local_up.log')
135
- logger.info(message_str)
238
+ path_to_package = os.path.dirname(__file__)
239
+ up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
136
240
 
137
- with rich_utils.safe_status(
138
- ux_utils.spinner_message(message_str,
139
- log_path=log_path,
140
- is_local=True)):
141
- returncode, _, stderr = log_lib.run_with_log(
142
- cmd=run_command,
143
- log_path=log_path,
144
- require_outputs=True,
145
- stream_logs=False,
146
- line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
147
- is_local=True),
148
- cwd=cwd)
241
+ # Get directory of script and run it from there
242
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
243
+ run_command = f'{up_script_path} {name} {f.name}'
244
+ if gpus:
245
+ run_command += ' --gpus'
246
+ run_command = shlex.split(run_command)
247
+
248
+ # Setup logging paths
249
+ run_timestamp = sky_logging.get_run_timestamp()
250
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
251
+ 'local_up.log')
252
+ logger.info(message_str)
253
+
254
+ with rich_utils.safe_status(
255
+ ux_utils.spinner_message(message_str,
256
+ log_path=log_path,
257
+ is_local=True)):
258
+ returncode, _, stderr = log_lib.run_with_log(
259
+ cmd=run_command,
260
+ log_path=log_path,
261
+ require_outputs=True,
262
+ stream_logs=False,
263
+ line_processor=log_utils.SkyLocalUpLineProcessor(
264
+ log_path=log_path, is_local=True),
265
+ cwd=cwd)
149
266
 
150
267
  # Kind always writes to stderr even if it succeeds.
151
268
  # If the failure happens after the cluster is created, we need
@@ -158,11 +275,11 @@ def deploy_local_cluster(gpus: bool):
158
275
  elif returncode == 100:
159
276
  logger.info(
160
277
  ux_utils.finishing_message(
161
- 'Local cluster already exists.\n',
278
+ f'Local cluster {name} already exists.\n',
162
279
  log_path=log_path,
163
280
  is_local=True,
164
281
  follow_up_message=
165
- 'If you want to delete it instead, run: sky local down'))
282
+ 'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
166
283
  else:
167
284
  with ux_utils.print_exception_no_traceback():
168
285
  log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -188,7 +305,7 @@ def deploy_local_cluster(gpus: bool):
188
305
  if gpus:
189
306
  # Get GPU model by querying the node labels
190
307
  label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
191
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
308
+ gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
192
309
  try:
193
310
  # Run the command and capture the output
194
311
  gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -224,8 +341,10 @@ def deploy_local_cluster(gpus: bool):
224
341
  'This may cause issues with running tasks.')
225
342
  logger.info(
226
343
  ux_utils.finishing_message(
227
- message=(f'Local Kubernetes cluster created successfully with '
228
- f'{num_cpus} CPUs{gpu_message}.'),
344
+ message=(
345
+ f'Local Kubernetes cluster {name} created successfully '
346
+ f'with {num_cpus} CPUs{gpu_message} on host port range '
347
+ f'{port_start}-{port_end}.'),
229
348
  log_path=log_path,
230
349
  is_local=True,
231
350
  follow_up_message=(
@@ -233,3 +352,54 @@ def deploy_local_cluster(gpus: bool):
233
352
  'Hint: To change the number of CPUs, change your docker '
234
353
  'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
235
354
  f'{gpu_hint}')))
355
+
356
+
357
def teardown_local_cluster(name: Optional[str] = None):
    """Remove a local cluster by running `delete_cluster.sh`.

    After a successful removal, `sky check` is re-run for the `kubernetes`
    cloud.

    Args:
        name: Name of the local cluster to remove. Falls back to
            DEFAULT_LOCAL_CLUSTER_NAME when None or empty.

    Raises:
        RuntimeError: if the delete script exits with a code other than 0
            (removed) or 100 (cluster does not exist).
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Build argv directly rather than formatting a string and re-splitting it
    # with shlex: an install path or cluster name containing whitespace or
    # quotes would otherwise be split into the wrong arguments.
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this line from stderr so it does not show up in the error
        # raised below.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Run sky check
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))
@@ -48,8 +48,16 @@ fi
48
48
 
49
49
  if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
50
50
  # If context is none, it means we are using incluster auth. In this case,
51
- # use need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
52
- kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --kubeconfig=/dev/null -- "$@"
51
+ # we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
52
+ kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --kubeconfig=/dev/null --"
53
53
  else
54
- kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --context="$context" -- "$@"
54
+ kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --context=\"$context\" --"
55
55
  fi
56
+
57
# Execute command on remote pod, waiting for rsync to be available first.
# The waiting happens on the remote pod, not locally, which is more efficient
# and reliable than polling from the local machine.
# We wrap the command in a bash script that waits for rsync, then execs the original command.
# Timeout after MAX_WAIT_TIME_SECONDS seconds. The loop sleeps 0.5s per
# iteration, hence the *2. max_count must be computed with $(( )): `[ -ge ]`
# only accepts plain integers, so a literal "300*2" would make every
# comparison fail and the timeout would never fire.
MAX_WAIT_TIME_SECONDS=300
eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=\$(($MAX_WAIT_TIME_SECONDS*2)); until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""