skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/authentication.py CHANGED
@@ -19,37 +19,34 @@ controller. (Lambda cloud is an exception, due to the limitation of the cloud
19
19
  provider. See the comments in setup_lambda_authentication)
20
20
  """
21
21
  import copy
22
- import functools
23
22
  import os
24
23
  import re
25
24
  import socket
26
25
  import subprocess
27
26
  import sys
28
- import typing
29
- from typing import Any, Dict, Tuple
27
+ from typing import Any, Dict
30
28
  import uuid
31
29
 
32
30
  import colorama
33
- import filelock
34
31
 
35
32
  from sky import clouds
36
33
  from sky import exceptions
37
34
  from sky import sky_logging
38
- from sky import skypilot_config
39
- from sky.adaptors import common as adaptors_common
40
35
  from sky.adaptors import gcp
41
36
  from sky.adaptors import ibm
42
- from sky.adaptors import kubernetes
43
37
  from sky.adaptors import runpod
38
+ from sky.adaptors import seeweb as seeweb_adaptor
39
+ from sky.adaptors import shadeform as shadeform_adaptor
44
40
  from sky.adaptors import vast
45
41
  from sky.provision.fluidstack import fluidstack_utils
46
42
  from sky.provision.kubernetes import utils as kubernetes_utils
47
43
  from sky.provision.lambda_cloud import lambda_utils
44
+ from sky.provision.primeintellect import utils as primeintellect_utils
45
+ from sky.utils import auth_utils
48
46
  from sky.utils import common_utils
49
- from sky.utils import config_utils
50
- from sky.utils import kubernetes_enums
51
47
  from sky.utils import subprocess_utils
52
48
  from sky.utils import ux_utils
49
+ from sky.utils import yaml_utils
53
50
 
54
51
  logger = sky_logging.init_logger(__name__)
55
52
 
@@ -58,114 +55,38 @@ logger = sky_logging.init_logger(__name__)
58
55
  # using Cloud Client Libraries for Python, where possible, for new code
59
56
  # development.
60
57
 
61
- MAX_TRIALS = 64
62
- # TODO(zhwu): Support user specified key pair.
63
- # We intentionally not have the ssh key pair to be stored in
64
- # ~/.sky/api_server/clients, i.e. sky.server.common.API_SERVER_CLIENT_DIR,
65
- # because ssh key pair need to persist across API server restarts, while
66
- # the former dir is empheral.
67
- _SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
68
-
69
- if typing.TYPE_CHECKING:
70
- import yaml
71
- else:
72
- yaml = adaptors_common.LazyImport('yaml')
73
-
74
-
75
- def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
76
- user_hash = common_utils.get_user_hash()
77
- user_ssh_key_prefix = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
78
-
79
- os.makedirs(os.path.expanduser(user_ssh_key_prefix),
80
- exist_ok=True,
81
- mode=0o700)
82
- private_key_path = os.path.join(user_ssh_key_prefix, 'sky-key')
83
- public_key_path = os.path.join(user_ssh_key_prefix, 'sky-key.pub')
84
- lock_path = os.path.join(user_ssh_key_prefix, '.__internal-sky-key.lock')
85
- return private_key_path, public_key_path, lock_path
86
-
87
-
88
- def _generate_rsa_key_pair() -> Tuple[str, str]:
89
- # Keep the import of the cryptography local to avoid expensive
90
- # third-party imports when not needed.
91
- # pylint: disable=import-outside-toplevel
92
- from cryptography.hazmat.backends import default_backend
93
- from cryptography.hazmat.primitives import serialization
94
- from cryptography.hazmat.primitives.asymmetric import rsa
95
-
96
- key = rsa.generate_private_key(backend=default_backend(),
97
- public_exponent=65537,
98
- key_size=2048)
99
-
100
- private_key = key.private_bytes(
101
- encoding=serialization.Encoding.PEM,
102
- format=serialization.PrivateFormat.TraditionalOpenSSL,
103
- encryption_algorithm=serialization.NoEncryption()).decode(
104
- 'utf-8').strip()
105
-
106
- public_key = key.public_key().public_bytes(
107
- serialization.Encoding.OpenSSH,
108
- serialization.PublicFormat.OpenSSH).decode('utf-8').strip()
109
-
110
- return public_key, private_key
111
-
112
-
113
- def _save_key_pair(private_key_path: str, public_key_path: str,
114
- private_key: str, public_key: str) -> None:
115
- key_dir = os.path.dirname(private_key_path)
116
- os.makedirs(key_dir, exist_ok=True, mode=0o700)
117
-
118
- with open(
119
- private_key_path,
120
- 'w',
121
- encoding='utf-8',
122
- opener=functools.partial(os.open, mode=0o600),
123
- ) as f:
124
- f.write(private_key)
125
-
126
- with open(public_key_path,
127
- 'w',
128
- encoding='utf-8',
129
- opener=functools.partial(os.open, mode=0o644)) as f:
130
- f.write(public_key)
131
-
132
-
133
- def get_or_generate_keys() -> Tuple[str, str]:
134
- """Returns the aboslute private and public key paths."""
135
- private_key_path, public_key_path, lock_path = get_ssh_key_and_lock_path()
136
- private_key_path = os.path.expanduser(private_key_path)
137
- public_key_path = os.path.expanduser(public_key_path)
138
- lock_path = os.path.expanduser(lock_path)
139
-
140
- lock_dir = os.path.dirname(lock_path)
141
- # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
142
- # as the ssh configs will be written to this folder as well in
143
- # backend_utils.SSHConfigHelper
144
- os.makedirs(lock_dir, exist_ok=True, mode=0o700)
145
- with filelock.FileLock(lock_path, timeout=10):
146
- if not os.path.exists(private_key_path):
147
- public_key, private_key = _generate_rsa_key_pair()
148
- _save_key_pair(private_key_path, public_key_path, private_key,
149
- public_key)
150
- assert os.path.exists(public_key_path), (
151
- 'Private key found, but associated public key '
152
- f'{public_key_path} does not exist.')
153
- return private_key_path, public_key_path
154
-
155
58
 
156
59
  def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
157
- _, public_key_path = get_or_generate_keys()
60
+ _, public_key_path = auth_utils.get_or_generate_keys()
158
61
  with open(public_key_path, 'r', encoding='utf-8') as f:
159
62
  public_key = f.read().strip()
160
- config_str = common_utils.dump_yaml_str(config)
63
+ config_str = yaml_utils.dump_yaml_str(config)
161
64
  config_str = config_str.replace('skypilot:ssh_user',
162
65
  config['auth']['ssh_user'])
163
66
  config_str = config_str.replace('skypilot:ssh_public_key_content',
164
67
  public_key)
165
- config = yaml.safe_load(config_str)
68
+ config = yaml_utils.safe_load(config_str)
166
69
  return config
167
70
 
168
71
 
72
+ def parse_gcp_project_oslogin(project):
73
+ """Helper function to parse GCP project metadata."""
74
+ common_metadata = project.get('commonInstanceMetadata', {})
75
+ if not isinstance(common_metadata, dict):
76
+ common_metadata = {}
77
+
78
+ metadata_items = common_metadata.get('items', [])
79
+ if not isinstance(metadata_items, list):
80
+ metadata_items = []
81
+
82
+ project_oslogin = next(
83
+ (item for item in metadata_items
84
+ if isinstance(item, dict) and item.get('key') == 'enable-oslogin'),
85
+ {}).get('value', 'False')
86
+
87
+ return project_oslogin
88
+
89
+
169
90
  # Snippets of code inspired from
170
91
  # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py
171
92
  # Takes in config, a yaml dict and outputs a postprocessed dict
@@ -174,7 +95,7 @@ def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
174
95
  # Retry for the GCP as sometimes there will be connection reset by peer error.
175
96
  @common_utils.retry
176
97
  def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
177
- _, public_key_path = get_or_generate_keys()
98
+ _, public_key_path = auth_utils.get_or_generate_keys()
178
99
  config = copy.deepcopy(config)
179
100
 
180
101
  project_id = config['provider']['project_id']
@@ -223,10 +144,7 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
223
144
  'Please check your network connection.')
224
145
  raise
225
146
 
226
- project_oslogin: str = next( # type: ignore
227
- (item for item in project['commonInstanceMetadata'].get('items', [])
228
- if item['key'] == 'enable-oslogin'), {}).get('value', 'False')
229
-
147
+ project_oslogin = parse_gcp_project_oslogin(project)
230
148
  if project_oslogin.lower() == 'true':
231
149
  logger.info(
232
150
  f'OS Login is enabled for GCP project {project_id}. Running '
@@ -242,7 +160,7 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
242
160
  os_login_username = None
243
161
  if proc.returncode == 0:
244
162
  try:
245
- profile = yaml.safe_load(proc.stdout)
163
+ profile = yaml_utils.safe_load(proc.stdout)
246
164
  username = profile['posixAccounts'][0]['username']
247
165
  if username:
248
166
  os_login_username = username
@@ -302,11 +220,11 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
302
220
 
303
221
  def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
304
222
 
305
- get_or_generate_keys()
223
+ auth_utils.get_or_generate_keys()
306
224
 
307
225
  # Ensure ssh key is registered with Lambda Cloud
308
226
  lambda_client = lambda_utils.LambdaCloudClient()
309
- _, public_key_path = get_or_generate_keys()
227
+ _, public_key_path = auth_utils.get_or_generate_keys()
310
228
  with open(public_key_path, 'r', encoding='utf-8') as f:
311
229
  public_key = f.read().strip()
312
230
  prefix = f'sky-key-{common_utils.get_user_hash()}'
@@ -323,7 +241,7 @@ def setup_ibm_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
323
241
  and updates config file.
324
242
  keys default location: '~/.ssh/sky-key' and '~/.ssh/sky-key.pub'
325
243
  """
326
- private_key_path, _ = get_or_generate_keys()
244
+ private_key_path, _ = auth_utils.get_or_generate_keys()
327
245
 
328
246
  def _get_unique_key_name():
329
247
  suffix_len = 10
@@ -332,7 +250,7 @@ def setup_ibm_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
332
250
  client = ibm.client(region=config['provider']['region'])
333
251
  resource_group_id = config['provider']['resource_group_id']
334
252
 
335
- _, public_key_path = get_or_generate_keys()
253
+ _, public_key_path = auth_utils.get_or_generate_keys()
336
254
  with open(os.path.abspath(os.path.expanduser(public_key_path)),
337
255
  'r',
338
256
  encoding='utf-8') as file:
@@ -372,116 +290,31 @@ def setup_ibm_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
372
290
 
373
291
 
374
292
  def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
375
- # Default ssh session is established with kubectl port-forwarding with
376
- # ClusterIP service.
377
- nodeport_mode = kubernetes_enums.KubernetesNetworkingMode.NODEPORT
378
- port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
379
- network_mode_str = skypilot_config.get_nested(('kubernetes', 'networking'),
380
- port_forward_mode.value)
381
- try:
382
- network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
383
- network_mode_str)
384
- except ValueError as e:
385
- # Add message saying "Please check: ~/.sky/config.yaml" to the error
386
- # message.
387
- with ux_utils.print_exception_no_traceback():
388
- raise ValueError(str(e) + ' Please check: ~/.sky/config.yaml.') \
389
- from None
390
- _, public_key_path = get_or_generate_keys()
391
-
392
- # Add the user's public key to the SkyPilot cluster.
393
- secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
394
- secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
395
- context = config['provider'].get(
396
- 'context', kubernetes_utils.get_current_kube_config_context_name())
397
- if context == kubernetes.in_cluster_context_name():
398
- # If the context is an in-cluster context name, we are running in a pod
399
- # with in-cluster configuration. We need to set the context to None
400
- # to use the mounted service account.
401
- context = None
293
+ context = kubernetes_utils.get_context_from_config(config['provider'])
402
294
  namespace = kubernetes_utils.get_namespace_from_config(config['provider'])
403
- k8s = kubernetes.kubernetes
404
- with open(public_key_path, 'r', encoding='utf-8') as f:
405
- public_key = f.read()
406
- if not public_key.endswith('\n'):
407
- public_key += '\n'
408
-
409
- # Generate metadata
410
- secret_metadata = {
411
- 'name': secret_name,
412
- 'labels': {
413
- 'parent': 'skypilot'
414
- }
415
- }
416
- custom_metadata = skypilot_config.get_nested(
417
- ('kubernetes', 'custom_metadata'), {})
418
- config_utils.merge_k8s_configs(secret_metadata, custom_metadata)
419
-
420
- secret = k8s.client.V1Secret(
421
- metadata=k8s.client.V1ObjectMeta(**secret_metadata),
422
- string_data={secret_field_name: public_key})
423
- try:
424
- if kubernetes_utils.check_secret_exists(secret_name, namespace,
425
- context):
426
- logger.debug(f'Key {secret_name} exists in the cluster, '
427
- 'patching it...')
428
- kubernetes.core_api(context).patch_namespaced_secret(
429
- secret_name, namespace, secret)
430
- else:
431
- logger.debug(f'Key {secret_name} does not exist in the cluster, '
432
- 'creating it...')
433
- kubernetes.core_api(context).create_namespaced_secret(
434
- namespace, secret)
435
- except kubernetes.api_exception() as e:
436
- if e.status == 409 and e.reason == 'AlreadyExists':
437
- logger.debug(f'Key {secret_name} was created concurrently, '
438
- 'patching it...')
439
- kubernetes.core_api(context).patch_namespaced_secret(
440
- secret_name, namespace, secret)
441
- else:
442
- raise e
443
-
444
- private_key_path, _ = get_or_generate_keys()
445
- if network_mode == nodeport_mode:
446
- ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
447
- service_type = kubernetes_enums.KubernetesServiceType.NODEPORT
448
- # Setup service for SSH jump pod. We create the SSH jump service here
449
- # because we need to know the service IP address and port to set the
450
- # ssh_proxy_command in the autoscaler config.
451
- kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, context,
452
- service_type)
453
- ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
454
- ssh_jump_name,
455
- nodeport_mode,
456
- private_key_path=private_key_path,
457
- context=context,
458
- namespace=namespace)
459
- elif network_mode == port_forward_mode:
460
- # Using `kubectl port-forward` creates a direct tunnel to the pod and
461
- # does not require a ssh jump pod.
462
- kubernetes_utils.check_port_forward_mode_dependencies()
463
- # TODO(romilb): This can be further optimized. Instead of using the
464
- # head node as a jump pod for worker nodes, we can also directly
465
- # set the ssh_target to the worker node. However, that requires
466
- # changes in the downstream code to return a mapping of node IPs to
467
- # pod names (to be used as ssh_target) and updating the upstream
468
- # SSHConfigHelper to use a different ProxyCommand for each pod.
469
- # This optimization can reduce SSH time from ~0.35s to ~0.25s, tested
470
- # on GKE.
471
- ssh_target = config['cluster_name'] + '-head'
472
- ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
473
- ssh_target,
474
- port_forward_mode,
475
- private_key_path=private_key_path,
476
- context=context,
477
- namespace=namespace)
478
- else:
479
- # This should never happen because we check for this in from_str above.
480
- raise ValueError(f'Unsupported networking mode: {network_mode_str}')
295
+ private_key_path, _ = auth_utils.get_or_generate_keys()
296
+ # Using `kubectl port-forward` creates a direct tunnel to the pod and
297
+ # does not require a ssh jump pod.
298
+ kubernetes_utils.check_port_forward_mode_dependencies()
299
+ # TODO(romilb): This can be further optimized. Instead of using the
300
+ # head node as a jump pod for worker nodes, we can also directly
301
+ # set the ssh_target to the worker node. However, that requires
302
+ # changes in the downstream code to return a mapping of node IPs to
303
+ # pod names (to be used as ssh_target) and updating the upstream
304
+ # SSHConfigHelper to use a different ProxyCommand for each pod.
305
+ # This optimization can reduce SSH time from ~0.35s to ~0.25s, tested
306
+ # on GKE.
307
+ pod_name = config['cluster_name'] + '-head'
308
+ ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
309
+ pod_name,
310
+ private_key_path=private_key_path,
311
+ context=context,
312
+ namespace=namespace)
481
313
  config['auth']['ssh_proxy_command'] = ssh_proxy_cmd
482
314
  config['auth']['ssh_private_key'] = private_key_path
483
315
 
484
- return config
316
+ # Add the user's public key to the SkyPilot cluster.
317
+ return configure_ssh_info(config)
485
318
 
486
319
 
487
320
  # ---------------------------------- RunPod ---------------------------------- #
@@ -490,7 +323,7 @@ def setup_runpod_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
490
323
  - Generates a new SSH key pair if one does not exist.
491
324
  - Adds the public SSH key to the user's RunPod account.
492
325
  """
493
- _, public_key_path = get_or_generate_keys()
326
+ _, public_key_path = auth_utils.get_or_generate_keys()
494
327
  with open(public_key_path, 'r', encoding='UTF-8') as pub_key_file:
495
328
  public_key = pub_key_file.read().strip()
496
329
  runpod.runpod.cli.groups.ssh.functions.add_ssh_key(public_key)
@@ -503,7 +336,7 @@ def setup_vast_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
503
336
  - Generates a new SSH key pair if one does not exist.
504
337
  - Adds the public SSH key to the user's Vast account.
505
338
  """
506
- _, public_key_path = get_or_generate_keys()
339
+ _, public_key_path = auth_utils.get_or_generate_keys()
507
340
  with open(public_key_path, 'r', encoding='UTF-8') as pub_key_file:
508
341
  public_key = pub_key_file.read().strip()
509
342
  current_key_list = vast.vast().show_ssh_keys() # pylint: disable=assignment-from-no-return
@@ -517,7 +350,7 @@ def setup_vast_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
517
350
 
518
351
  def setup_fluidstack_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
519
352
 
520
- _, public_key_path = get_or_generate_keys()
353
+ _, public_key_path = auth_utils.get_or_generate_keys()
521
354
 
522
355
  client = fluidstack_utils.FluidstackClient()
523
356
  public_key = None
@@ -526,3 +359,124 @@ def setup_fluidstack_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
526
359
  client.get_or_add_ssh_key(public_key)
527
360
  config['auth']['ssh_public_key'] = public_key_path
528
361
  return configure_ssh_info(config)
362
+
363
+
364
def setup_hyperbolic_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
    """Populate a Hyperbolic cluster config with SSH key information.

    Reads the public key returned by ``auth_utils.get_or_generate_keys()``,
    stores its content under ``config['publicKey']``, fills in the ``auth``
    section (user ``ubuntu`` and the public key path), and returns the result
    of ``configure_ssh_info(config)``.
    """
    _, pub_key_file = auth_utils.get_or_generate_keys()
    with open(pub_key_file, 'r', encoding='utf-8') as key_fp:
        key_text = key_fp.read().strip()

    # TODO: adjust below to use public_keys instead of
    # public_key once backwards-compatibility is no longer required
    config['publicKey'] = key_text

    # Auth section consumed by the Ray template; create it if absent.
    auth_section = config.setdefault('auth', {})
    auth_section['ssh_user'] = 'ubuntu'
    auth_section['ssh_public_key'] = pub_key_file

    return configure_ssh_info(config)
380
+
381
+
382
def setup_shadeform_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
    """Sets up SSH authentication for Shadeform.

    - Obtains the local key pair via ``auth_utils.get_or_generate_keys()``.
    - Registers the public key with Shadeform through
      ``shadeform_adaptor.add_ssh_key_to_shadeform``.
    - Records the public key path and the returned key id under
      ``config['auth']``.

    Args:
        config: Cluster config; ``config['auth']`` must already exist.

    Returns:
        The result of ``configure_ssh_info(config)``.

    Raises:
        exceptions.CloudUserIdentityError: If registering the key fails or
            no key id is obtained (including the missing-dependency case).
    """
    _, public_key_path = auth_utils.get_or_generate_keys()
    ssh_key_id = None

    with open(public_key_path, 'r', encoding='utf-8') as f:
        public_key = f.read().strip()

    try:
        # Add SSH key to Shadeform using our utility functions
        ssh_key_id = shadeform_adaptor.add_ssh_key_to_shadeform(public_key)

    except ImportError as e:
        # Required dependencies are missing. Only warn here; the missing
        # key id is reported by the check below.
        logger.warning(
            f'Failed to add Shadeform SSH key due to missing dependencies: '
            f'{e}. Manually configure SSH keys in your Shadeform account.')

    except Exception as e:
        logger.warning(f'Failed to set up Shadeform authentication: {e}')
        raise exceptions.CloudUserIdentityError(
            'Failed to set up SSH authentication for Shadeform. '
            f'Please ensure your Shadeform credentials are configured: {e}'
        ) from e

    if ssh_key_id is None:
        # Use the same error type as the failure path above instead of a
        # bare Exception, so callers can handle both cases uniformly.
        raise exceptions.CloudUserIdentityError(
            'Failed to add SSH key to Shadeform')

    # Configure SSH info in the config
    config['auth']['ssh_public_key'] = public_key_path
    config['auth']['ssh_key_id'] = ssh_key_id

    return configure_ssh_info(config)
422
+
423
+
424
def setup_primeintellect_authentication(
        config: Dict[str, Any]) -> Dict[str, Any]:
    """Configure SSH authentication for a Prime Intellect cluster.

    Reads the public key returned by ``auth_utils.get_or_generate_keys()``,
    passes it to ``PrimeIntellectAPIClient.get_or_add_ssh_key``, fills in the
    ``auth`` section (user ``ubuntu`` and the public key path), and returns
    the result of ``configure_ssh_info(config)``.
    """
    # Local key pair: only the public half is needed here.
    _, pub_key_file = auth_utils.get_or_generate_keys()
    with open(pub_key_file, 'r', encoding='utf-8') as key_fp:
        key_text = key_fp.read().strip()

    # Register the key with the provider.
    api_client = primeintellect_utils.PrimeIntellectAPIClient()
    api_client.get_or_add_ssh_key(key_text)

    # Auth section consumed by the Ray template; create it if absent.
    auth_section = config.setdefault('auth', {})
    # Default username for Prime Intellect images
    auth_section['ssh_user'] = 'ubuntu'
    auth_section['ssh_public_key'] = pub_key_file

    return configure_ssh_info(config)
446
+
447
+
448
def setup_seeweb_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
    """Registers the public key with Seeweb and notes the remote name.

    If a remote key with the same content already exists, its label is
    reused. Otherwise a new key is created under ``sky-key-<user hash>``,
    with ``-2``, ``-3``, ... appended until the label is unused.

    Args:
        config: Cluster config; ``config['auth']`` must already exist.

    Returns:
        The same ``config`` with ``config['auth']['remote_key_name']`` set.
    """
    # 1. Local key pair (a single call is enough) and public key content.
    _, public_key_path = auth_utils.get_or_generate_keys()
    with open(public_key_path, 'r', encoding='utf-8') as f:
        public_key = f.read().strip()

    # 2. Seeweb API client
    client = seeweb_adaptor.client()

    # 3. Fetch the remote keys once and reuse the snapshot for both the
    # content lookup and the label-collision check.
    prefix = f'sky-key-{common_utils.get_user_hash()}'
    remote_keys = list(client.fetch_ssh_keys())

    # 4. Check if key is already registered (first content match wins).
    remote_name = next(
        (k.label for k in remote_keys if k.key.strip() == public_key), None)

    # 5. Doesn't exist: choose a unique name and create it.
    if remote_name is None:
        existing_names = {k.label for k in remote_keys}
        suffix = 1
        remote_name = prefix
        while remote_name in existing_names:
            suffix += 1
            remote_name = f'{prefix}-{suffix}'
        client.create_ssh_key(label=remote_name, key=public_key)

    # 6. Put the remote name in cluster-config (like for Lambda)
    config['auth']['remote_key_name'] = remote_name

    return config
sky/backends/__init__.py CHANGED
@@ -3,11 +3,13 @@ from sky.backends.backend import Backend
3
3
  from sky.backends.backend import ResourceHandle
4
4
  from sky.backends.cloud_vm_ray_backend import CloudVmRayBackend
5
5
  from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
6
+ from sky.backends.cloud_vm_ray_backend import LocalResourcesHandle
7
+ from sky.backends.cloud_vm_ray_backend import SkyletClient
6
8
  from sky.backends.local_docker_backend import LocalDockerBackend
7
9
  from sky.backends.local_docker_backend import LocalDockerResourceHandle
8
10
 
9
11
  __all__ = [
10
12
  'Backend', 'ResourceHandle', 'CloudVmRayBackend',
11
- 'CloudVmRayResourceHandle', 'LocalDockerBackend',
12
- 'LocalDockerResourceHandle'
13
+ 'CloudVmRayResourceHandle', 'SkyletClient', 'LocalResourcesHandle',
14
+ 'LocalDockerBackend', 'LocalDockerResourceHandle'
13
15
  ]
sky/backends/backend.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """Sky backend interface."""
2
2
  import typing
3
- from typing import Dict, Generic, Optional, Tuple
3
+ from typing import Any, Dict, Generic, Optional, Tuple, Union
4
4
 
5
5
  from sky.usage import usage_lib
6
6
  from sky.utils import cluster_utils
@@ -37,8 +37,9 @@ class Backend(Generic[_ResourceHandleType]):
37
37
  ResourceHandle = ResourceHandle # pylint: disable=invalid-name
38
38
 
39
39
  # --- APIs ---
40
- def check_resources_fit_cluster(self, handle: _ResourceHandleType,
41
- task: 'task_lib.Task') -> None:
40
+ def check_resources_fit_cluster(
41
+ self, handle: _ResourceHandleType,
42
+ task: 'task_lib.Task') -> Optional['resources.Resources']:
42
43
  """Check whether resources of the task are satisfied by cluster."""
43
44
  raise NotImplementedError
44
45
 
@@ -89,8 +90,16 @@ class Backend(Generic[_ResourceHandleType]):
89
90
 
90
91
  @timeline.event
91
92
  @usage_lib.messages.usage.update_runtime('sync_workdir')
92
- def sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
93
- return self._sync_workdir(handle, workdir)
93
+ def sync_workdir(self, handle: _ResourceHandleType,
94
+ workdir: Union[Path, Dict[str, Any]],
95
+ envs_and_secrets: Dict[str, str]) -> None:
96
+ return self._sync_workdir(handle, workdir, envs_and_secrets)
97
+
98
+ @timeline.event
99
+ @usage_lib.messages.usage.update_runtime('download_file')
100
+ def download_file(self, handle: _ResourceHandleType, local_file_path: str,
101
+ remote_file_path: str) -> None:
102
+ return self._download_file(handle, local_file_path, remote_file_path)
94
103
 
95
104
  @timeline.event
96
105
  @usage_lib.messages.usage.update_runtime('sync_file_mounts')
@@ -117,7 +126,6 @@ class Backend(Generic[_ResourceHandleType]):
117
126
  def execute(self,
118
127
  handle: _ResourceHandleType,
119
128
  task: 'task_lib.Task',
120
- detach_run: bool,
121
129
  dryrun: bool = False) -> Optional[int]:
122
130
  """Execute the task on the cluster.
123
131
 
@@ -128,7 +136,7 @@ class Backend(Generic[_ResourceHandleType]):
128
136
  handle.get_cluster_name())
129
137
  usage_lib.messages.usage.update_actual_task(task)
130
138
  with rich_utils.safe_status(ux_utils.spinner_message('Submitting job')):
131
- return self._execute(handle, task, detach_run, dryrun)
139
+ return self._execute(handle, task, dryrun)
132
140
 
133
141
  @timeline.event
134
142
  def post_execute(self, handle: _ResourceHandleType, down: bool) -> None:
@@ -164,7 +172,13 @@ class Backend(Generic[_ResourceHandleType]):
164
172
  ) -> Tuple[Optional[_ResourceHandleType], bool]:
165
173
  raise NotImplementedError
166
174
 
167
- def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None:
175
+ def _sync_workdir(self, handle: _ResourceHandleType,
176
+ workdir: Union[Path, Dict[str, Any]],
177
+ envs_and_secrets: Dict[str, str]) -> None:
178
+ raise NotImplementedError
179
+
180
+ def _download_file(self, handle: _ResourceHandleType, local_file_path: str,
181
+ remote_file_path: str) -> None:
168
182
  raise NotImplementedError
169
183
 
170
184
  def _sync_file_mounts(
@@ -182,7 +196,6 @@ class Backend(Generic[_ResourceHandleType]):
182
196
  def _execute(self,
183
197
  handle: _ResourceHandleType,
184
198
  task: 'task_lib.Task',
185
- detach_run: bool,
186
199
  dryrun: bool = False) -> Optional[int]:
187
200
  raise NotImplementedError
188
201