skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/db_utils.py DELETED
@@ -1,100 +0,0 @@
1
- """Utils for sky databases."""
2
- import contextlib
3
- import sqlite3
4
- import threading
5
- from typing import Any, Callable, Optional
6
-
7
- # This parameter (passed to sqlite3.connect) controls how long we will wait to
8
- # obtains a database lock (not necessarily during connection, but whenever it is
9
- # needed). It is not a connection timeout.
10
- # Even in WAL mode, only a single writer is allowed at a time. Other writers
11
- # will block until the write lock can be obtained. This behavior is described in
12
- # the SQLite documentation for WAL: https://www.sqlite.org/wal.html
13
- # Python's default timeout is 5s. In normal usage, lock contention is very low,
14
- # and this is more than sufficient. However, in some highly concurrent cases,
15
- # such as a jobs controller suddenly recovering thousands of jobs at once, we
16
- # can see a small number of processes that take much longer to obtain the lock.
17
- # In contrived highly contentious cases, around 0.1% of transactions will take
18
- # >30s to take the lock. We have not seen cases that take >60s. For cases up to
19
- # 1000x parallelism, this is thus thought to be a conservative setting.
20
- # For more info, see the PR description for #4552.
21
- _DB_TIMEOUT_S = 60
22
-
23
-
24
- @contextlib.contextmanager
25
- def safe_cursor(db_path: str):
26
- """A newly created, auto-committing, auto-closing cursor."""
27
- conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
28
- cursor = conn.cursor()
29
- try:
30
- yield cursor
31
- finally:
32
- cursor.close()
33
- conn.commit()
34
- conn.close()
35
-
36
-
37
- def add_column_to_table(
38
- cursor: 'sqlite3.Cursor',
39
- conn: 'sqlite3.Connection',
40
- table_name: str,
41
- column_name: str,
42
- column_type: str,
43
- copy_from: Optional[str] = None,
44
- value_to_replace_existing_entries: Optional[Any] = None,
45
- ):
46
- """Add a column to a table."""
47
- for row in cursor.execute(f'PRAGMA table_info({table_name})'):
48
- if row[1] == column_name:
49
- break
50
- else:
51
- try:
52
- add_column_cmd = (f'ALTER TABLE {table_name} '
53
- f'ADD COLUMN {column_name} {column_type}')
54
- cursor.execute(add_column_cmd)
55
- if copy_from is not None:
56
- cursor.execute(f'UPDATE {table_name} '
57
- f'SET {column_name} = {copy_from}')
58
- if value_to_replace_existing_entries is not None:
59
- cursor.execute(
60
- f'UPDATE {table_name} '
61
- f'SET {column_name} = (?) '
62
- f'WHERE {column_name} IS NULL',
63
- (value_to_replace_existing_entries,))
64
- except sqlite3.OperationalError as e:
65
- if 'duplicate column name' in str(e):
66
- # We may be trying to add the same column twice, when
67
- # running multiple threads. This is fine.
68
- pass
69
- else:
70
- raise
71
- conn.commit()
72
-
73
-
74
- def rename_column(
75
- cursor: 'sqlite3.Cursor',
76
- conn: 'sqlite3.Connection',
77
- table_name: str,
78
- old_name: str,
79
- new_name: str,
80
- ):
81
- """Rename a column in a table."""
82
- # NOTE: This only works for sqlite3 >= 3.25.0. Be careful to use this.
83
-
84
- for row in cursor.execute(f'PRAGMA table_info({table_name})'):
85
- if row[1] == old_name:
86
- cursor.execute(f'ALTER TABLE {table_name} '
87
- f'RENAME COLUMN {old_name} to {new_name}')
88
- break
89
- conn.commit()
90
-
91
-
92
- class SQLiteConn(threading.local):
93
- """Thread-local connection to the sqlite3 database."""
94
-
95
- def __init__(self, db_path: str, create_table: Callable):
96
- super().__init__()
97
- self.db_path = db_path
98
- self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
99
- self.cursor = self.conn.cursor()
100
- create_table(self.cursor, self.conn)
@@ -1,308 +0,0 @@
1
#!/bin/bash
# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
set -e

# Colors for nicer UX
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No color

# Variables
CLEANUP=false
INSTALL_GPU=false
POSITIONAL_ARGS=()
PASSWORD=""

# Process all arguments: flags may appear anywhere, everything else is kept
# in order as a positional argument.
while [[ $# -gt 0 ]]; do
    case $1 in
        --cleanup)
            CLEANUP=true
            shift
            ;;
        --password)
            # Without this check a trailing `--password` makes `shift` fail
            # and `set -e` ends the script without any message.
            if [[ $# -lt 2 ]]; then
                >&2 echo -e "${RED}Error: --password requires a value.${NC}"
                exit 1
            fi
            PASSWORD=$2
            shift 2
            ;;
        *)
            POSITIONAL_ARGS+=("$1")
            shift
            ;;
    esac
done

# Restore positional arguments in correct order
set -- "${POSITIONAL_ARGS[@]}"

# Assign positional arguments to variables
IPS_FILE=$1
USER=$2
SSH_KEY=$3
CONTEXT_NAME=${4:-default}
K3S_TOKEN=mytoken # Any string can be used as the token
# Shell snippet prepended to every remote command: it writes a temporary
# askpass script on the remote host so that `sudo -A` can read the password.
# NOTE(review): $PASSWORD is expanded here, unquoted, into the remote script,
# so passwords containing shell metacharacters may not survive intact.
ASKPASS_BLOCK="# Create temporary askpass script
ASKPASS_SCRIPT=\$(mktemp)
trap 'rm -f \$ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
cat > \$ASKPASS_SCRIPT << EOF
#!/bin/bash
echo $PASSWORD
EOF
chmod 700 \$ASKPASS_SCRIPT
# Use askpass
export SUDO_ASKPASS=\$ASKPASS_SCRIPT
"

# Basic argument checks
if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
    >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
    >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup] [--password password]"
    exit 1
fi

# Check if SSH key exists
if [ ! -f "$SSH_KEY" ]; then
    >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
    exit 1
fi

# Check if IPs file exists
if [ ! -f "$IPS_FILE" ]; then
    >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
    exit 1
fi

# Get head node and worker nodes from the IPs file: the first line is the
# head node, every following line is a worker.
HEAD_NODE=$(head -n 1 "$IPS_FILE")
WORKER_NODES=$(tail -n +2 "$IPS_FILE")

# Check if the IPs file is empty or not formatted correctly
if [ -z "$HEAD_NODE" ]; then
    >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
    exit 1
fi
86
-
87
- # Function to show a progress message
88
progress_message() {
    # Print the text in $1 as a yellow, arrow-prefixed progress line.
    local message=$1
    echo -e "${YELLOW}➜ ${message}${NC}"
}
91
-
92
- # Function to show a success message
93
success_message() {
    # Print the text in $1 as a green, check-mark-prefixed success line.
    local message=$1
    echo -e "${GREEN}✔ ${message}${NC}"
}
96
-
97
- # Function to run a command on a remote machine via SSH
98
run_remote() {
    # Run the command string $2 on host $1 over SSH, logging in as $USER
    # with the private key $SSH_KEY. The exit status is that of ssh.
    local host=$1
    local remote_cmd=$2
    local -a ssh_opts=(-o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY")
    ssh "${ssh_opts[@]}" "$USER@$host" "$remote_cmd"
}
104
-
105
- # Function to uninstall k3s and clean up the state on a remote machine
106
cleanup_server_node() {
    # Uninstall the k3s server on the head node $1 and delete leftover
    # Kubernetes state directories there.
    local head_ip=$1
    # A failing uninstall script is tolerated (`|| true`) so the state
    # directories are removed regardless.
    local uninstall_cmd="
        $ASKPASS_BLOCK
        echo 'Uninstalling k3s...' &&
        sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
    "
    echo -e "${YELLOW}Cleaning up head node $head_ip...${NC}"
    run_remote "$head_ip" "$uninstall_cmd"
    echo -e "${GREEN}Node $head_ip cleaned up successfully.${NC}"
}
117
-
118
- # Function to uninstall the k3s agent and clean up the state on a worker machine
119
cleanup_agent_node() {
    # Uninstall the k3s agent on the worker node $1 and delete leftover
    # Kubernetes state directories there.
    local worker_ip=$1
    # A failing uninstall script is tolerated (`|| true`) so the state
    # directories are removed regardless.
    local uninstall_cmd="
        $ASKPASS_BLOCK
        echo 'Uninstalling k3s...' &&
        sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
    "
    echo -e "${YELLOW}Cleaning up node $worker_ip...${NC}"
    run_remote "$worker_ip" "$uninstall_cmd"
    echo -e "${GREEN}Node $worker_ip cleaned up successfully.${NC}"
}
130
-
131
check_gpu() {
    # Return 0 when nvidia-smi is installed on host $1 and can list a GPU
    # name; return 1 in every other case (including a failed SSH call).
    local host=$1
    local probe_cmd="command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"
    run_remote "$host" "$probe_cmd" || return 1 # No GPU detected
    return 0 # GPU detected
}
139
-
140
# Pre-flight check: make sure the head node can be reached over SSH.
run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible

# With --cleanup: uninstall k3s everywhere, drop the local context and exit.
if [[ "$CLEANUP" == "true" ]]; then
    echo -e "${YELLOW}Starting cleanup...${NC}"

    # The head node runs the k3s server; every other node runs an agent.
    cleanup_server_node "$HEAD_NODE"
    for AGENT_IP in $WORKER_NODES; do
        cleanup_agent_node "$AGENT_IP"
    done

    # Remove the context from local kubeconfig if it exists
    if [[ -f "$HOME/.kube/config" ]]; then
        progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
        # Context, cluster and user entries all carry the context name;
        # missing entries are not an error.
        for ENTRY_KIND in context cluster user; do
            kubectl config "delete-$ENTRY_KIND" "$CONTEXT_NAME" 2>/dev/null || true
        done
        # Update the current context to the first available context
        kubectl config use-context $(kubectl config view -o jsonpath='{.contexts[0].name}') 2>/dev/null || true
        success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
    fi

    echo -e "${GREEN}Cleanup completed successfully.${NC}"
    exit 0
fi
170
-
171
# Step 1: Install k3s on the head node
#
# The remote command installs the k3s server, copies its kubeconfig into the
# remote user's home and then waits (up to three attempts) for all nodes to
# become ready. Two things are handled explicitly:
#   * a failure anywhere in the install chain aborts with a non-zero status
#     instead of falling through to the readiness check;
#   * readiness is tracked with a flag, so a successful third attempt is not
#     reported as a failure (comparing the loop counter to 3 cannot tell a
#     late success from exhausting all attempts).
progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
run_remote "$HEAD_NODE" "
    $ASKPASS_BLOCK
    curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sudo -E -A sh - &&
    mkdir -p ~/.kube &&
    sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
    sudo -A chown \$(id -u):\$(id -g) ~/.kube/config || {
        echo 'Failed to install k3s or set up its kubeconfig on the head node'
        exit 1
    }
    nodes_ready=false
    for attempt in 1 2 3; do
        if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
            nodes_ready=true
            break
        fi
        echo 'Waiting for nodes to be ready...'
        sleep 5
    done
    if [ \$nodes_ready != true ]; then
        echo 'Failed to wait for nodes to be ready after 3 attempts'
        exit 1
    fi"
success_message "K3s deployed on head node."
192
-
193
# Record whether any node has a GPU, starting with the head node.
if check_gpu "$HEAD_NODE"; then
    echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
    INSTALL_GPU=true
fi

# Fetch the head node's internal IP (the first address reported by
# `hostname -I`); worker nodes use it to reach the k3s server.
MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")

echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"

# Step 2: Install k3s on worker nodes and join them to the master node
# The join command is the same for every worker, so build it once.
JOIN_CMD="
    $ASKPASS_BLOCK
    curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sudo -E -A sh -"
for WORKER_IP in $WORKER_NODES; do
    progress_message "Deploying Kubernetes on worker node ($WORKER_IP)..."
    run_remote "$WORKER_IP" "$JOIN_CMD"
    success_message "Kubernetes deployed on worker node ($WORKER_IP)."

    # Check if worker node has a GPU
    if check_gpu "$WORKER_IP"; then
        echo -e "${YELLOW}GPU detected on worker node ($WORKER_IP).${NC}"
        INSTALL_GPU=true
    fi
done
218
# Step 3: Configure local kubectl to connect to the cluster
progress_message "Configuring local kubectl to connect to the cluster..."

# Create temporary directory for kubeconfig operations. It will hold cluster
# credentials, so make sure it is removed even when a later command fails
# and `set -e` aborts the script.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf "$TEMP_DIR"' EXIT
TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"

# Get the kubeconfig from remote server
scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"

# Create .kube directory if it doesn't exist
mkdir -p "$HOME/.kube"

# Create empty kubeconfig if it doesn't exist
KUBECONFIG_FILE="$HOME/.kube/config"
if [[ ! -f "$KUBECONFIG_FILE" ]]; then
    touch "$KUBECONFIG_FILE"
fi

# Modify the temporary kubeconfig to update server address and context name.
# Inside the `clusters:` section the CA data is dropped and the server line
# is replaced by the head node address plus `insecure-skip-tls-verify`; every
# `default` name is renamed to the requested context name.
# The head node address is passed with -v instead of being spliced into the
# awk program text.
awk -v context="$CONTEXT_NAME" -v server="$HEAD_NODE" '
    /^clusters:/ { in_cluster = 1 }
    /^users:/ { in_cluster = 0 }
    in_cluster && /^ *certificate-authority-data:/ { next }
    in_cluster && /^ *server:/ {
        # Reuse the indentation of the replaced line so the YAML stays valid.
        match($0, /^ */)
        indent = substr($0, 1, RLENGTH)
        print indent "server: https://" server ":6443"
        print indent "insecure-skip-tls-verify: true"
        next
    }
    /name: default/ { sub("name: default", "name: " context) }
    /cluster: default/ { sub("cluster: default", "cluster: " context) }
    /user: default/ { sub("user: default", "user: " context) }
    /current-context: default/ { sub("current-context: default", "current-context: " context) }
    { print }
' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"

# Merge the configurations using kubectl
KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
# The moved file was created with the default umask; the kubeconfig contains
# credentials, so restrict it to the current user.
chmod 600 "$KUBECONFIG_FILE"

# Set the new context as the current context
kubectl config use-context "$CONTEXT_NAME"

# Clean up temporary files; the EXIT trap is no longer needed afterwards.
rm -rf "$TEMP_DIR"
trap - EXIT

success_message "kubectl configured with new context '$CONTEXT_NAME'."

echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
267
-
268
# Install GPU operator if a GPU was detected on any node
if [[ "$INSTALL_GPU" != "true" ]]; then
    echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
else
    echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
    # Runs on the head node: install helm, add the NVIDIA chart repo, install
    # the operator configured for k3s' containerd paths, then poll until a
    # node reports an `nvidia.com/gpu:` resource.
    GPU_OPERATOR_CMD="
        $ASKPASS_BLOCK
        curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
        chmod 700 get_helm.sh &&
        ./get_helm.sh &&
        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
        kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
        sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
        helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
        --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
        --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
        --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
        --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
        --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
        --set 'toolkit.env[2].value=nvidia' &&
        echo 'Waiting for GPU operator installation...' &&
        while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
            echo 'Waiting for GPU operator...'
            sleep 5
        done
        echo 'GPU operator installed successfully.'"
    run_remote "$HEAD_NODE" "$GPU_OPERATOR_CMD"
    success_message "GPU Operator installed."
fi
296
-
297
# Configure SkyPilot: let it check the Kubernetes setup created above.
progress_message "Configuring SkyPilot..."
sky check kubernetes
success_message "SkyPilot configured successfully."

# Display final success message followed by a few getting-started hints.
echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
printf '%s\n' \
    "You can now interact with your Kubernetes cluster through SkyPilot: " \
    " • List available GPUs: sky show-gpus --cloud kubernetes" \
    " • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1" \
    " • Connect to pod with SSH: ssh devbox" \
    " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
@@ -1,191 +0,0 @@
1
- """Manages lifecycle of ssh jump pod.
2
-
3
- This script runs inside ssh jump pod as the main process (PID 1).
4
-
5
- It terminates itself (by removing ssh jump service and pod via a call to
6
- kubeapi) if it does not see ray pods in the duration of 10 minutes. If the
7
- user re-launches a task before the duration is over, then ssh jump pod is being
8
- reused and will terminate itself when it sees that no ray clusters exist in
9
- that duration.
10
-
11
- To allow multiple users to share the same SSH jump pod,
12
- this script also reloads SSH keys from the mounted secret volume on an
13
- interval and updates `~/.ssh/authorized_keys`.
14
- """
15
- import datetime
16
- import os
17
- import subprocess
18
- import sys
19
- import threading
20
- import time
21
-
22
- from kubernetes import client
23
- from kubernetes import config
24
-
25
# Load the in-cluster Kubernetes configuration.
config.load_incluster_config()

v1 = client.CoreV1Api()

# Name and namespace of this pod, read from the environment (None if unset).
current_name = os.environ.get('MY_POD_NAME')
current_namespace = os.environ.get('MY_POD_NAMESPACE')

# Seconds without any Ray pod after which the ssh jump pod terminates itself.
alert_threshold = int(os.environ.get('ALERT_THRESHOLD', '600'))
# Seconds to wait between two checks for existing Ray pods.
retry_interval = int(os.environ.get('RETRY_INTERVAL', '60'))
# Seconds to wait between two SSH key reloads.
reload_interval = int(os.environ.get('RELOAD_INTERVAL', '5'))

# Ray pods are labeled with this value i.e., ssh jump name which is unique per
# user (based on user hash)
label_selector = f'skypilot-ssh-jump={current_name}'
44
-
45
-
46
def poll(interval, leading=True):
    """Decorator factory that turns a function into a polling loop.

    The decorated function is called repeatedly with the arguments given to
    the wrapper until it returns a truthy value; the wrapper itself then
    returns None. An exception raised by the function ends the loop and
    propagates to the caller.

    Args:
        interval (int): The amount of time in seconds to wait between
            function calls.
        leading (bool): Whether to wait before (rather than after) calls.

    Returns:
        A decorator that wraps a function in the polling loop.
    """
    # Local import so this block does not depend on the file-level imports.
    import functools

    def decorator(func):

        # Preserve the name and docstring of the polled function so that
        # logs and tracebacks do not just show `wrapper`.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            while True:
                if leading:
                    time.sleep(interval)
                if func(*args, **kwargs):
                    return
                if not leading:
                    time.sleep(interval)

        return wrapper

    return decorator
69
-
70
-
71
# Set to True by the lifecycle thread once it has finished; the key reloader
# checks it on every iteration and stops polling when it is set.
terminated = False


@poll(interval=reload_interval, leading=False)
def reload_keys():
    """Refresh ``~/.ssh/authorized_keys`` from the mounted secret volume.

    Returns True (which stops the polling loop) once ``terminated`` is set,
    otherwise None. Any failure is logged to stdout and re-raised.
    """
    if terminated:
        sys.stdout.write('[SSH Key Reloader] Terminated.\n')
        return True

    staged_keys = '/tmp/sky-ssh-keys'
    try:
        # Concatenate every mounted public key into a staging file.
        subprocess.check_output(
            f'cat /etc/secret-volume/ssh-publickey* > {staged_keys}',
            shell=True)
        try:
            subprocess.check_output(
                f'diff {staged_keys} ~/.ssh/authorized_keys', shell=True)
        except subprocess.CalledProcessError as diff_error:
            # Exit status 1 is handled as "files differ"; any other non-zero
            # status is treated as a failure and re-raised.
            if diff_error.returncode != 1:
                raise
            sys.stdout.write(
                '[SSH Key Reloader] Changes detected, reloading.\n')
            subprocess.check_output(
                f'mv {staged_keys} ~/.ssh/authorized_keys', shell=True)
        else:
            sys.stdout.write(
                '[SSH Key Reloader] No keys changed, continuing.\n')
    except Exception as error:
        sys.stdout.write('[SSH Key Reloader][ERROR] Failed to reload SSH '
                         f'keys: {error}\n')
        raise
106
-
107
-
108
alert_delta = datetime.timedelta(seconds=alert_threshold)
retry_interval_delta = datetime.timedelta(seconds=retry_interval)
# Accumulated time during which no SkyPilot cluster was seen. It grows by
# retry_interval_delta per empty check and is compared against alert_delta.
nocluster_delta = datetime.timedelta()


@poll(interval=retry_interval)
def manage_lifecycle():
    """Delete the ssh jump service and pod after a long enough idle period.

    Each call lists the pods matching ``label_selector`` in the current
    namespace. An empty result adds one retry interval to ``nocluster_delta``;
    a non-empty result resets it. Once the accumulated time reaches
    ``alert_delta`` the service and pod named ``current_name`` are deleted,
    ``terminated`` is set and True is returned to stop polling. Failures of
    the list or delete calls are logged to stdout and re-raised.
    """
    global terminated, nocluster_delta

    try:
        pods = v1.list_namespaced_pod(current_namespace,
                                      label_selector=label_selector)
    except Exception as error:
        sys.stdout.write(
            f'[Lifecycle] [ERROR] listing pods failed with error: {error}\n')
        raise

    if pods.items:
        sys.stdout.write(
            f'[Lifecycle] Found pods with label "{label_selector}" in '
            f'namespace {current_namespace}\n')
        # Pods exist again, so the idle timer starts over.
        nocluster_delta = datetime.timedelta()
        sys.stdout.write(
            f'[Lifecycle] nocluster_delta is reset: {nocluster_delta}\n')
    else:
        sys.stdout.write(
            f'[Lifecycle] Did not find pods with label "{label_selector}" '
            f'in namespace {current_namespace}\n')
        nocluster_delta += retry_interval_delta
        sys.stdout.write(
            f'[Lifecycle] Time since no pods found: {nocluster_delta}, '
            f'alert threshold: {alert_delta}\n')

    if nocluster_delta < alert_delta:
        # Not idle for long enough yet; keep polling.
        return None

    sys.stdout.write(
        f'[Lifecycle] nocluster_delta: {nocluster_delta} crossed alert '
        f'threshold: {alert_delta}. Time to terminate myself and my '
        'service.\n')
    try:
        # ssh jump resources created under same name
        v1.delete_namespaced_service(current_name, current_namespace)
        v1.delete_namespaced_pod(current_name, current_namespace)
    except Exception as error:
        sys.stdout.write('[Lifecycle][ERROR] Deletion failed. Exiting poll() '
                         f'with error: {error}\n')
        raise

    terminated = True
    return True
162
-
163
-
164
def main():
    """Print the configuration, then run both polling loops to completion.

    Raises:
        Exception: If MY_POD_NAME or MY_POD_NAMESPACE is missing or empty.
    """
    sys.stdout.writelines([
        'SkyPilot SSH Jump Pod Lifecycle Manager\n',
        f'current_name: {current_name}\n',
        f'current_namespace: {current_namespace}\n',
        f'alert_threshold time: {alert_threshold}\n',
        f'retry_interval time: {retry_interval}\n',
        f'reload_interval time: {reload_interval}\n',
        f'label_selector: {label_selector}\n',
    ])

    if not (current_name and current_namespace):
        # Raise Exception with message to terminate pod
        raise Exception('Missing environment variables MY_POD_NAME or '
                        'MY_POD_NAMESPACE')

    # One thread per polling loop: lifecycle management and key reloading.
    workers = [
        threading.Thread(target=loop)
        for loop in (manage_lifecycle, reload_keys)
    ]
    sys.stdout.write(f'Polling with {len(workers)} threads.\n')
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    sys.stdout.write('Done.\n')


if __name__ == '__main__':
    main()