skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,14 @@
1
1
  """Utility functions for deploying Kubernetes clusters."""
2
2
  import os
3
+ import random
3
4
  import shlex
4
5
  import subprocess
6
+ import sys
5
7
  import tempfile
6
- from typing import List, Optional
8
+ import textwrap
9
+ from typing import List, Optional, Tuple
10
+
11
+ import colorama
7
12
 
8
13
  from sky import check as sky_check
9
14
  from sky import sky_logging
@@ -19,6 +24,151 @@ from sky.utils import ux_utils
19
24
 
20
25
  logger = sky_logging.init_logger(__name__)
21
26
 
27
+ # Default path for Kubernetes configuration file
28
+ DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
29
+ DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
30
+ LOCAL_CLUSTER_PORT_RANGE = 100
31
+ LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
32
+ LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
33
+
34
+
35
+ def check_ssh_cluster_dependencies(
36
+ raise_error: bool = True) -> Optional[List[str]]:
37
+ """Checks if the dependencies for ssh cluster are installed.
38
+
39
+ Args:
40
+ raise_error: set to true when the dependency needs to be present.
41
+ set to false for `sky check`, where reason strings are compiled
42
+ at the end.
43
+
44
+ Returns: the reasons list if there are missing dependencies.
45
+ """
46
+ # error message
47
+ jq_message = ('`jq` is required to setup ssh cluster.')
48
+
49
+ # save
50
+ reasons = []
51
+ required_binaries = []
52
+
53
+ # Ensure jq is installed
54
+ try:
55
+ subprocess.run(['jq', '--version'],
56
+ stdout=subprocess.DEVNULL,
57
+ stderr=subprocess.DEVNULL,
58
+ check=True)
59
+ except (FileNotFoundError, subprocess.CalledProcessError):
60
+ required_binaries.append('jq')
61
+ reasons.append(jq_message)
62
+
63
+ if required_binaries:
64
+ reasons.extend([
65
+ 'On Debian/Ubuntu, install the missing dependenc(ies) with:',
66
+ f' $ sudo apt install {" ".join(required_binaries)}',
67
+ 'On MacOS, install with: ',
68
+ f' $ brew install {" ".join(required_binaries)}',
69
+ ])
70
+ if raise_error:
71
+ with ux_utils.print_exception_no_traceback():
72
+ raise RuntimeError('\n'.join(reasons))
73
+ return reasons
74
+ return None
75
+
76
+
77
+ def deploy_ssh_cluster(cleanup: bool = False,
78
+ infra: Optional[str] = None,
79
+ kubeconfig_path: Optional[str] = None):
80
+ """Deploy a Kubernetes cluster on SSH targets.
81
+
82
+ This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
83
+ Kubernetes cluster on the specified machines.
84
+
85
+ Args:
86
+ cleanup: Whether to clean up the cluster instead of deploying.
87
+ infra: Name of the cluster in ssh_node_pools.yaml to use.
88
+ If None, the first cluster in the file will be used.
89
+ kubeconfig_path: Path to save the Kubernetes configuration file.
90
+ If None, the default ~/.kube/config will be used.
91
+ """
92
+ check_ssh_cluster_dependencies()
93
+
94
+ # Prepare command to call deploy_remote_cluster.py script
95
+ # TODO(romilb): We should move this to a native python method/class call
96
+ # instead of invoking a script with subprocess.
97
+ path_to_package = os.path.dirname(__file__)
98
+ up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
99
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
100
+
101
+ deploy_command = [sys.executable, up_script_path]
102
+
103
+ if cleanup:
104
+ deploy_command.append('--cleanup')
105
+
106
+ if infra:
107
+ deploy_command.extend(['--infra', infra])
108
+
109
+ # Use the default kubeconfig path if none is provided
110
+ kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
111
+ deploy_command.extend(['--kubeconfig-path', kubeconfig_path])
112
+
113
+ # Setup logging paths
114
+ run_timestamp = sky_logging.get_run_timestamp()
115
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
116
+ 'ssh_up.log')
117
+
118
+ if cleanup:
119
+ msg_str = 'Cleaning up SSH Node Pools...'
120
+ else:
121
+ msg_str = 'Initializing deployment to SSH Node Pools...'
122
+
123
+ # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
124
+ env = os.environ.copy()
125
+ env['PYTHONUNBUFFERED'] = '1'
126
+
127
+ with rich_utils.safe_status(
128
+ ux_utils.spinner_message(msg_str, log_path=log_path,
129
+ is_local=True)):
130
+ returncode, _, stderr = log_lib.run_with_log(
131
+ cmd=deploy_command,
132
+ log_path=log_path,
133
+ require_outputs=True,
134
+ stream_logs=False,
135
+ line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
136
+ is_local=False),
137
+ cwd=cwd,
138
+ env=env)
139
+
140
+ if returncode == 0:
141
+ success = True
142
+ else:
143
+ with ux_utils.print_exception_no_traceback():
144
+ log_hint = ux_utils.log_path_hint(log_path, is_local=False)
145
+ raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
146
+ f'{log_hint}'
147
+ f'\nError: {stderr}')
148
+
149
+ if success:
150
+ # Add an empty line to separate the deployment logs from the final
151
+ # message
152
+ logger.info('')
153
+ if cleanup:
154
+ logger.info(
155
+ ux_utils.finishing_message(
156
+ '🎉 SSH Node Pools cleaned up successfully.',
157
+ log_path=log_path,
158
+ is_local=True))
159
+ else:
160
+ logger.info(
161
+ ux_utils.finishing_message(
162
+ '🎉 SSH Node Pools set up successfully. ',
163
+ follow_up_message=(
164
+ f'Run `{colorama.Style.BRIGHT}'
165
+ f'sky check ssh'
166
+ f'{colorama.Style.RESET_ALL}` to verify access, '
167
+ f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
168
+ f'{colorama.Style.RESET_ALL}` to launch a cluster. '),
169
+ log_path=log_path,
170
+ is_local=True))
171
+
22
172
 
23
173
  def deploy_remote_cluster(ip_list: List[str],
24
174
  ssh_user: str,
@@ -28,7 +178,7 @@ def deploy_remote_cluster(ip_list: List[str],
28
178
  password: Optional[str] = None):
29
179
  success = False
30
180
  path_to_package = os.path.dirname(__file__)
31
- up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
181
+ up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
32
182
  # Get directory of script and run it from there
33
183
  cwd = os.path.dirname(os.path.abspath(up_script_path))
34
184
 
@@ -44,17 +194,18 @@ def deploy_remote_cluster(ip_list: List[str],
44
194
  key_file.flush()
45
195
  os.chmod(key_file.name, 0o600)
46
196
 
47
- deploy_command = (f'{up_script_path} {ip_file.name} '
48
- f'{ssh_user} {key_file.name}')
197
+ # Use the legacy mode command line arguments for backward compatibility
198
+ deploy_command = [
199
+ sys.executable, up_script_path, '--ips-file', ip_file.name,
200
+ '--user', ssh_user, '--ssh-key', key_file.name
201
+ ]
202
+
49
203
  if context_name is not None:
50
- deploy_command += f' {context_name}'
204
+ deploy_command.extend(['--context-name', context_name])
51
205
  if password is not None:
52
- deploy_command += f' --password {password}'
206
+ deploy_command.extend(['--password', password])
53
207
  if cleanup:
54
- deploy_command += ' --cleanup'
55
-
56
- # Convert the command to a format suitable for subprocess
57
- deploy_command = shlex.split(deploy_command)
208
+ deploy_command.append('--cleanup')
58
209
 
59
210
  # Setup logging paths
60
211
  run_timestamp = sky_logging.get_run_timestamp()
@@ -65,6 +216,11 @@ def deploy_remote_cluster(ip_list: List[str],
65
216
  msg_str = 'Cleaning up remote cluster...'
66
217
  else:
67
218
  msg_str = 'Deploying remote cluster...'
219
+
220
+ # Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
221
+ env = os.environ.copy()
222
+ env['PYTHONUNBUFFERED'] = '1'
223
+
68
224
  with rich_utils.safe_status(
69
225
  ux_utils.spinner_message(msg_str,
70
226
  log_path=log_path,
@@ -76,7 +232,8 @@ def deploy_remote_cluster(ip_list: List[str],
76
232
  stream_logs=False,
77
233
  line_processor=log_utils.SkyRemoteUpLineProcessor(
78
234
  log_path=log_path, is_local=True),
79
- cwd=cwd)
235
+ cwd=cwd,
236
+ env=env)
80
237
  if returncode == 0:
81
238
  success = True
82
239
  else:
@@ -101,7 +258,93 @@ def deploy_remote_cluster(ip_list: List[str],
101
258
  is_local=True))
102
259
 
103
260
 
104
- def deploy_local_cluster(gpus: bool):
261
+ def generate_kind_config(port_start: int,
262
+ num_nodes: int = 1,
263
+ gpus: bool = False) -> str:
264
+ """Generate a kind cluster config with ports mapped from host to container
265
+
266
+ Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
267
+ Internally, this maps to container ports starting at LOCAL_CLUSTER_INTERNAL_PORT_START (30000 - 30099; TODO confirm against the constants)
268
+
269
+ Args:
270
+ port_start: First host port; host port port_start + i is mapped to
271
+ container port LOCAL_CLUSTER_INTERNAL_PORT_START + i
272
+ num_nodes: Number of nodes in the cluster (one control-plane, the rest workers)
273
+ gpus: If true, initialize kind cluster with GPU support
274
+
275
+ Returns:
276
+ The kind cluster config as a YAML string
277
+ """
278
+ internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
279
+ internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
280
+
281
+ config = textwrap.dedent(f"""
282
+ apiVersion: kind.x-k8s.io/v1alpha4
283
+ kind: Cluster
284
+ kubeadmConfigPatches:
285
+ - |
286
+ kind: ClusterConfiguration
287
+ apiServer:
288
+ extraArgs:
289
+ "service-node-port-range": {internal_start}-{internal_end}
290
+ nodes:
291
+ - role: control-plane
292
+ kubeadmConfigPatches:
293
+ - |
294
+ kind: InitConfiguration
295
+ nodeRegistration:
296
+ kubeletExtraArgs:
297
+ node-labels: "ingress-ready=true"
298
+ """)
299
+ if gpus:
300
+ config += textwrap.indent(
301
+ textwrap.dedent("""
302
+ extraMounts:
303
+ - hostPath: /dev/null
304
+ containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
305
+ config += textwrap.indent(textwrap.dedent("""
306
+ extraPortMappings:"""), ' ' * 2)
307
+ for offset in range(LOCAL_CLUSTER_PORT_RANGE):
308
+ config += textwrap.indent(
309
+ textwrap.dedent(f"""
310
+ - containerPort: {internal_start + offset}
311
+ hostPort: {port_start + offset}
312
+ listenAddress: "0.0.0.0"
313
+ protocol: tcp
314
+ """), ' ' * 2)
315
+ if num_nodes > 1:
316
+ config += '- role: worker\n' * (num_nodes - 1)
317
+ return config
318
+
319
+
320
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
    """Resolve and validate the host port range for a local cluster.

    Args:
        name: Local cluster name. A name equal to DEFAULT_LOCAL_CLUSTER_NAME
            denotes the default cluster, which must start at
            LOCAL_CLUSTER_INTERNAL_PORT_START.
        port_start: First host port of the range, or None to choose one
            automatically: LOCAL_CLUSTER_INTERNAL_PORT_START for the default
            cluster, otherwise a random multiple of 100 in 30100-39900.

    Returns:
        A tuple (port_start, port_end), both inclusive, spanning
        LOCAL_CLUSTER_PORT_RANGE consecutive ports.

    Raises:
        ValueError: If the default cluster is given a non-reserved start, a
            non-default cluster is given the reserved start, the start is not
            a multiple of 100, or the range falls outside valid TCP ports
            (1-65535).
    """
    is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
    if port_start is None:
        if is_default:
            port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
        else:
            # Picks one of 30100, 30200, ..., 39900.
            port_start = random.randint(301, 399) * 100
    port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1

    port_range = f'Current port range: {port_start}-{port_end}'
    if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Default local cluster `skypilot` should have '
                         f'port range from 30000 to 30099. {port_range}.')
    if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Port range 30000 to 30099 is reserved for '
                         f'default local cluster `skypilot`. {port_range}.')
    if port_start % 100 != 0:
        raise ValueError('Local cluster port start must be a multiple of 100. '
                         f'{port_range}.')
    # Checked last so the more specific errors above keep precedence. Without
    # this, 0, negative multiples of 100, or e.g. 65500 would pass and only
    # fail later when the host ports are actually bound.
    if port_start < 1 or port_end > 65535:
        raise ValueError('Local cluster port range must be within 1 to 65535. '
                         f'{port_range}.')

    return port_start, port_end
341
+
342
+
343
+ def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
344
+ gpus: bool):
345
+ name = name or DEFAULT_LOCAL_CLUSTER_NAME
346
+ port_start, port_end = _get_port_range(name, port_start)
347
+ context_name = f'kind-{name}'
105
348
  cluster_created = False
106
349
 
107
350
  # Check if GPUs are available on the host
@@ -111,41 +354,52 @@ def deploy_local_cluster(gpus: bool):
111
354
  # Check if ~/.kube/config exists:
112
355
  if os.path.exists(os.path.expanduser('~/.kube/config')):
113
356
  curr_context = kubernetes_utils.get_current_kube_config_context_name()
114
- skypilot_context = 'kind-skypilot'
115
- if curr_context is not None and curr_context != skypilot_context:
357
+ if curr_context is not None and curr_context != context_name:
116
358
  logger.info(
117
359
  f'Current context in kube config: {curr_context}'
118
- '\nWill automatically switch to kind-skypilot after the local '
119
- 'cluster is created.')
120
- message_str = 'Creating local cluster{}...'
121
- message_str = message_str.format((' with GPU support (this may take up '
122
- 'to 15 minutes)') if gpus else '')
123
- path_to_package = os.path.dirname(__file__)
124
- up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
360
+ f'\nWill automatically switch to {context_name} after the '
361
+ 'local cluster is created.')
362
+ message_str = 'Creating local cluster {}{}...'
363
+ message_str = message_str.format(
364
+ name,
365
+ ' with GPU support (this may take up to 15 minutes)' if gpus else '')
125
366
 
126
- # Get directory of script and run it from there
127
- cwd = os.path.dirname(os.path.abspath(up_script_path))
128
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
129
- run_command = shlex.split(run_command)
367
+ with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
368
+ delete=True) as f:
369
+ # Choose random port range to use on the host machine.
370
+ # Port range is port_start - port_start + 99 (exactly 100 ports).
371
+ logger.debug(f'Using host port range {port_start}-{port_end}')
372
+ f.write(generate_kind_config(port_start, gpus=gpus))
373
+ f.flush()
130
374
 
131
- # Setup logging paths
132
- run_timestamp = sky_logging.get_run_timestamp()
133
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
134
- 'local_up.log')
135
- logger.info(message_str)
375
+ path_to_package = os.path.dirname(__file__)
376
+ up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
136
377
 
137
- with rich_utils.safe_status(
138
- ux_utils.spinner_message(message_str,
139
- log_path=log_path,
140
- is_local=True)):
141
- returncode, _, stderr = log_lib.run_with_log(
142
- cmd=run_command,
143
- log_path=log_path,
144
- require_outputs=True,
145
- stream_logs=False,
146
- line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
147
- is_local=True),
148
- cwd=cwd)
378
+ # Get directory of script and run it from there
379
+ cwd = os.path.dirname(os.path.abspath(up_script_path))
380
+ run_command = f'{up_script_path} {name} {f.name}'
381
+ if gpus:
382
+ run_command += ' --gpus'
383
+ run_command = shlex.split(run_command)
384
+
385
+ # Setup logging paths
386
+ run_timestamp = sky_logging.get_run_timestamp()
387
+ log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
388
+ 'local_up.log')
389
+ logger.info(message_str)
390
+
391
+ with rich_utils.safe_status(
392
+ ux_utils.spinner_message(message_str,
393
+ log_path=log_path,
394
+ is_local=True)):
395
+ returncode, _, stderr = log_lib.run_with_log(
396
+ cmd=run_command,
397
+ log_path=log_path,
398
+ require_outputs=True,
399
+ stream_logs=False,
400
+ line_processor=log_utils.SkyLocalUpLineProcessor(
401
+ log_path=log_path, is_local=True),
402
+ cwd=cwd)
149
403
 
150
404
  # Kind always writes to stderr even if it succeeds.
151
405
  # If the failure happens after the cluster is created, we need
@@ -158,11 +412,11 @@ def deploy_local_cluster(gpus: bool):
158
412
  elif returncode == 100:
159
413
  logger.info(
160
414
  ux_utils.finishing_message(
161
- 'Local cluster already exists.\n',
415
+ f'Local cluster {name} already exists.\n',
162
416
  log_path=log_path,
163
417
  is_local=True,
164
418
  follow_up_message=
165
- 'If you want to delete it instead, run: sky local down'))
419
+ 'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
166
420
  else:
167
421
  with ux_utils.print_exception_no_traceback():
168
422
  log_hint = ux_utils.log_path_hint(log_path, is_local=True)
@@ -188,7 +442,7 @@ def deploy_local_cluster(gpus: bool):
188
442
  if gpus:
189
443
  # Get GPU model by querying the node labels
190
444
  label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
191
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
445
+ gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
192
446
  try:
193
447
  # Run the command and capture the output
194
448
  gpu_count_output = subprocess.check_output(gpu_type_cmd,
@@ -224,8 +478,10 @@ def deploy_local_cluster(gpus: bool):
224
478
  'This may cause issues with running tasks.')
225
479
  logger.info(
226
480
  ux_utils.finishing_message(
227
- message=(f'Local Kubernetes cluster created successfully with '
228
- f'{num_cpus} CPUs{gpu_message}.'),
481
+ message=(
482
+ f'Local Kubernetes cluster {name} created successfully '
483
+ f'with {num_cpus} CPUs{gpu_message} on host port range '
484
+ f'{port_start}-{port_end}.'),
229
485
  log_path=log_path,
230
486
  is_local=True,
231
487
  follow_up_message=(
@@ -233,3 +489,54 @@ def deploy_local_cluster(gpus: bool):
233
489
  'Hint: To change the number of CPUs, change your docker '
234
490
  'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
235
491
  f'{gpu_hint}')))
492
+
493
+
494
def teardown_local_cluster(name: Optional[str] = None):
    """Remove a local cluster by running the bundled ``delete_cluster.sh``.

    On success, the Kubernetes compute capability check is re-run quietly and
    a completion message is logged. If the script reports that the cluster
    does not exist (exit code 100), that is logged and nothing else happens.

    Args:
        name: Name of the local cluster to remove. Falls back to
            DEFAULT_LOCAL_CLUSTER_NAME when None or empty.

    Raises:
        RuntimeError: If the script exits with a code other than 0 or 100.
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    # Run the script from its own directory.
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Pass an argv list directly rather than formatting a string and
    # re-splitting it: a script path or cluster name containing whitespace or
    # quotes would otherwise be broken into several arguments.
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this notice so it does not clutter the error message below.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            # Exit code 100 is reported as "does not exist", not as a failure.
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Re-run the Kubernetes compute check now that the cluster is gone.
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))