skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,400 @@
1
+ """ Shadeform Cloud. """
2
+
3
+ import json
4
+ import os
5
+ import typing
6
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
7
+
8
+ from sky import catalog
9
+ from sky import clouds
10
+ from sky.adaptors import common as adaptors_common
11
+ from sky.catalog import shadeform_catalog
12
+ from sky.utils import registry
13
+ from sky.utils import resources_utils
14
+ from sky.utils import status_lib
15
+
16
+ if typing.TYPE_CHECKING:
17
+ from sky import resources as resources_lib
18
+ from sky.utils import volume as volume_lib
19
+ else:
20
+ requests = adaptors_common.LazyImport('requests')
21
+
22
+ # Minimum set of files under ~/.shadeform that grant Shadeform access.
23
+ _CREDENTIAL_FILES = [
24
+ 'api_key',
25
+ ]
26
+
27
+
28
+ @registry.CLOUD_REGISTRY.register
29
+ class Shadeform(clouds.Cloud):
30
+ """Shadeform GPU Cloud
31
+
32
+ Shadeform is a unified API for deploying and managing cloud GPUs across
33
+ multiple cloud providers.
34
+ """
35
+
36
+ # Shadeform doesn't document an explicit cluster name limit, so use a conservative one.
37
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 120
38
+
39
+ # Features not currently supported by Shadeform
40
+ # yapf: disable
41
+ _CLOUD_UNSUPPORTED_FEATURES = {
42
+ clouds.CloudImplementationFeatures.STOP:
43
+ 'Stopping instances not supported on Shadeform.',
44
+ clouds.CloudImplementationFeatures.MULTI_NODE:
45
+ 'Multi-node clusters not supported on Shadeform.',
46
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE:
47
+ 'Spot instances not supported on Shadeform.',
48
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
49
+ 'Custom disk tiers not supported on Shadeform.',
50
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
51
+ 'Custom network tiers not supported on Shadeform.',
52
+ clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
53
+ 'Object storage mounting not supported on Shadeform.',
54
+ clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
55
+ 'Host controllers not supported on Shadeform.',
56
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
57
+ 'High availability controllers not supported.',
58
+ clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
59
+ 'Disk cloning not supported on Shadeform.',
60
+ clouds.CloudImplementationFeatures.IMAGE_ID:
61
+ 'Custom image IDs not supported on Shadeform.',
62
+ clouds.CloudImplementationFeatures.DOCKER_IMAGE:
63
+ 'Docker images not supported on Shadeform yet.',
64
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
65
+ 'Custom multiple network interfaces not supported.',
66
+ }
67
+ # yapf: enable
68
+
69
+ _regions: List[clouds.Region] = []
70
+
71
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
72
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
73
+ OPEN_PORTS_VERSION = clouds.OpenPortsVersion.LAUNCH_ONLY
74
+
75
@classmethod
def _unsupported_features_for_resources(
    cls,
    resources: 'resources_lib.Resources',
    region: Optional[str] = None,
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the features Shadeform cannot provide, with reasons.

    The answer does not depend on the requested resources or region: the
    same class-level table applies to every request.

    Args:
        resources: The requested resources (ignored).
        region: The requested region, if any (ignored).

    Returns:
        A new dict mapping each unsupported feature to a human-readable
        explanation. A shallow copy is returned so that a caller which
        amends the mapping cannot alter ``_CLOUD_UNSUPPORTED_FEATURES``
        for subsequent calls.
    """
    del resources, region  # unused
    # Copy rather than hand out the shared class attribute itself.
    return dict(cls._CLOUD_UNSUPPORTED_FEATURES)
84
+
85
@classmethod
def _max_cluster_name_length(cls) -> Optional[int]:
    """Return the cluster name length cap configured on this class."""
    name_length_cap = cls._MAX_CLUSTER_NAME_LEN_LIMIT
    return name_length_cap
88
+
89
@classmethod
def regions_with_offering(
    cls,
    instance_type: str,
    accelerators: Optional[Dict[str, int]],
    use_spot: bool,
    region: Optional[str],
    zone: Optional[str],
    resources: Optional['resources_lib.Resources'] = None,
) -> List[clouds.Region]:
    """List the regions in which ``instance_type`` is offered.

    Args:
        instance_type: The exact Shadeform instance type to look up.
        accelerators: Not consulted by this implementation.
        use_spot: Whether spot capacity is requested; spot is not
            supported, so a true value always yields an empty list.
        region: When given, restrict the result to the region with this
            name.
        zone: Must be ``None``; Shadeform has no zones.
        resources: Not consulted by this implementation.

    Returns:
        The matching regions, possibly empty.
    """
    assert zone is None, 'Shadeform does not support zones.'
    del zone  # unused
    if use_spot:
        # No spot support, hence no region can satisfy the request.
        return []

    # NOTE: ``instance_type`` is the concrete Shadeform instance type
    # (e.g. 'massedcompute_A6000_base'), not an accelerator name, so the
    # catalog lookup only yields regions where that exact type exists.
    offered = shadeform_catalog.get_region_zones_for_instance_type(
        instance_type, use_spot)

    if region is None:
        return offered
    return [candidate for candidate in offered if candidate.name == region]
114
+
115
@classmethod
def zones_provision_loop(
    cls,
    *,
    region: str,
    num_nodes: int,
    instance_type: str,
    accelerators: Optional[Dict[str, int]] = None,
    use_spot: bool = False,
) -> Iterator[None]:
    """Yield one provisioning attempt per region offering the instance.

    Shadeform has no zones, so every yielded value is ``None``. Nothing is
    yielded when spot instances are requested.

    Args:
        region: Name of the region to provision in.
        num_nodes: Not used.
        instance_type: The Shadeform instance type to provision.
        accelerators: Forwarded to ``regions_with_offering``.
        use_spot: Whether spot capacity is requested.

    Yields:
        ``None`` for each region returned by ``regions_with_offering``.
    """
    del num_nodes  # unused
    if not use_spot:
        candidates = cls.regions_with_offering(instance_type, accelerators,
                                               use_spot, region, None)
        for candidate in candidates:
            # Regions are expected to carry no zone information.
            assert candidate.zones is None, candidate
            yield candidate.zones
135
+
136
+ @classmethod
137
def get_vcpus_mem_from_instance_type(
    cls,
    instance_type: str,
) -> Tuple[Optional[float], Optional[float]]:
    """Look up vCPUs and memory for ``instance_type``.

    Delegates to ``catalog.get_vcpus_mem_from_instance_type`` with the
    cloud fixed to 'shadeform' and returns its result unchanged.
    """
    vcpus_and_memory = catalog.get_vcpus_mem_from_instance_type(
        instance_type, clouds='shadeform')
    return vcpus_and_memory
144
+
145
+ @classmethod
146
def get_accelerators_from_instance_type(
    cls,
    instance_type: str,
) -> Optional[Dict[str, Union[int, float]]]:
    """Look up the accelerators attached to ``instance_type``.

    Delegates to ``catalog.get_accelerators_from_instance_type`` with the
    cloud fixed to 'shadeform' and returns its result unchanged.
    """
    attached = catalog.get_accelerators_from_instance_type(
        instance_type, clouds='shadeform')
    return attached
153
+
154
+ @classmethod
155
def get_default_instance_type(
    cls,
    cpus: Optional[str] = None,
    memory: Optional[str] = None,
    disk_tier: Optional[resources_utils.DiskTier] = None,
    region: Optional[str] = None,
    zone: Optional[str] = None,
) -> Optional[str]:
    """Ask the catalog for a default Shadeform instance type.

    ``disk_tier`` is dropped and the catalog is always queried with
    ``disk_tier=None``; all other arguments are forwarded as given.
    """
    del disk_tier  # Not supported
    catalog_query = {
        'cpus': cpus,
        'memory': memory,
        'disk_tier': None,
        'region': region,
        'zone': zone,
        'clouds': 'shadeform',
    }
    return catalog.get_default_instance_type(**catalog_query)
171
+
172
+ @classmethod
173
def get_zone_shell_cmd(cls) -> Optional[str]:
    """Return None: no zone-lookup shell command is provided."""
    zone_cmd = None
    return zone_cmd
176
+
177
+ @classmethod
178
def get_user_identities(cls) -> Optional[List[List[str]]]:
    """Return None: user identities are not tracked for Shadeform."""
    # No user identity support needed
    identities = None
    return identities
182
+
183
def instance_type_exists(self, instance_type: str) -> bool:
    """Return ``catalog.instance_type_exists`` for the 'shadeform' cloud."""
    found = catalog.instance_type_exists(instance_type, 'shadeform')
    return found
185
+
186
def instance_type_to_hourly_cost(self,
                                 instance_type: str,
                                 use_spot: bool,
                                 region: Optional[str] = None,
                                 zone: Optional[str] = None) -> float:
    """Return the catalog hourly cost of ``instance_type``.

    Raises:
        ValueError: If ``use_spot`` is true; spot is not supported.
    """
    if use_spot:
        raise ValueError('Spot instances are not supported on Shadeform')
    hourly_price = catalog.get_hourly_cost(instance_type,
                                           use_spot=use_spot,
                                           region=region,
                                           zone=zone,
                                           clouds='shadeform')
    return hourly_price
199
+
200
def accelerators_to_hourly_cost(self,
                                accelerators: Dict[str, int],
                                use_spot: bool,
                                region: Optional[str] = None,
                                zone: Optional[str] = None) -> float:
    """Return 0.0 regardless of the arguments.

    No argument is read; accelerators carry no separate hourly charge in
    this implementation.
    """
    no_extra_charge = 0.0
    return no_extra_charge
207
+
208
def get_egress_cost(self, num_gigabytes: float) -> float:
    """Return 0.0 for any ``num_gigabytes``."""
    # No explicit egress pricing from Shadeform API
    egress_price = 0.0
    return egress_price
212
+
213
def __repr__(self):
    """Return the fixed display name of this cloud."""
    display_name = 'Shadeform'
    return display_name
215
+
216
+ @classmethod
217
def get_current_user_identity(cls) -> Optional[str]:
    """Return None: no current-user identity is reported."""
    identity = None
    return identity
220
+
221
def make_deploy_resources_variables(
    self,
    resources: 'resources_lib.Resources',
    cluster_name: resources_utils.ClusterName,
    region: 'clouds.Region',
    zones: Optional[List['clouds.Zone']],
    num_nodes: int,
    dryrun: bool = False,
    volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
) -> Dict[str, Any]:
    """Build the variables consumed by the deployment template.

    Args:
        resources: The resources being launched.
        cluster_name: Not read by this method.
        region: Region whose ``name`` is written to the 'region' variable.
        zones, num_nodes, dryrun, volume_mounts: Unused for Shadeform.

    Returns:
        A dict that may contain 'instance_type', 'region' and 'cloud'
        (set together when an instance type is resolved) and
        'custom_resources' (compact JSON of the requested accelerators).
    """
    del zones, num_nodes, dryrun, volume_mounts  # unused for Shadeform

    # Resolve the instance type from a copy with accelerators cleared.
    without_accelerators = resources.copy(accelerators=None)
    feasible = self._get_feasible_launchable_resources(without_accelerators)
    instance_type = feasible.resources_list[0].instance_type

    template_vars: Dict[str, Any] = {}
    if instance_type is not None:
        # The text before the first underscore is used as 'cloud', e.g.
        # 'massedcompute' for 'massedcompute_A6000_base'.
        provider = instance_type.partition('_')[0]
        template_vars['instance_type'] = instance_type
        template_vars['region'] = region.name
        template_vars['cloud'] = provider

    # Add accelerator resources for Ray
    requested_accelerators = resources.accelerators
    if requested_accelerators is not None:
        template_vars['custom_resources'] = json.dumps(
            requested_accelerators, separators=(',', ':'))

    return template_vars
257
+
258
def get_credential_file_mounts(self) -> Dict[str, str]:
    """Map each credential file under ``~/.shadeform/`` to the same path.

    One entry is produced per name in the module-level
    ``_CREDENTIAL_FILES``; key and value are identical.
    """
    mounts: Dict[str, str] = {}
    for filename in _CREDENTIAL_FILES:
        path = f'~/.shadeform/{filename}'
        mounts[path] = path
    return mounts
263
+
264
+ @classmethod
265
def get_current_user_identity_str(cls) -> Optional[str]:
    """Return None: no identity string is available."""
    identity_str = None
    return identity_str
268
+
269
+ @classmethod
270
def check_credentials(
    cls, cloud_capability: clouds.CloudCapability
) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
    """Verify that a non-empty API key file exists.

    The key is read from ``~/.shadeform/api_key``; only its presence and
    non-emptiness (after stripping whitespace) are checked here.

    Args:
        cloud_capability: Ignored; the same check applies to all.

    Returns:
        ``(True, None)`` when the key file holds a non-empty key,
        otherwise ``(False, reason)``.
    """
    del cloud_capability  # unused for Shadeform
    try:
        key_file = os.path.expanduser('~/.shadeform/api_key')
        if not os.path.exists(key_file):
            return False, ('Shadeform API key not found. '
                           f'Please save your API key to {key_file}')

        # Try to read the API key
        with open(key_file, 'r', encoding='utf-8') as handle:
            stored_key = handle.read().strip()

        if stored_key:
            return True, None
        return False, f'Shadeform API key is empty in {key_file}'

    except OSError as err:  # IOError is an alias of OSError.
        return False, f'Error checking Shadeform credentials: {str(err)}'
292
+
293
def _get_feasible_launchable_resources(
    self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
    """Resolve ``resources`` into concrete launchable candidates.

    Resolution order:
      1. Spot requests are rejected with a hint.
      2. An explicitly given instance type is returned as-is.
      3. A requested accelerator is mapped to every instance type the
         Shadeform catalog offers for it, one candidate per instance
         type, so each is matched only against regions where that exact
         instance type exists.
      4. Otherwise the catalog's default instance type is used.

    Only the first entry of ``resources.accelerators`` is consulted. An
    empty accelerator mapping is treated like "no accelerators" and
    falls through to step 4; it previously raised IndexError.

    Args:
        resources: The (possibly partial) resource request.

    Returns:
        A ``FeasibleResources`` holding the candidates, the fuzzy
        candidate names, and an optional hint when nothing is feasible.
    """
    if resources.use_spot:
        return resources_utils.FeasibleResources(
            [], [], 'Spot instances are not supported on Shadeform.')

    if resources.instance_type is not None:
        # Instance type is already specified, validate it
        assert resources.is_launchable(), resources
        return resources_utils.FeasibleResources([resources],
                                                 [resources.instance_type],
                                                 None)

    def _make_resources(instance_type_list):
        """Build one candidate per instance type, keeping accelerators."""
        return [
            resources.copy(
                cloud=Shadeform(),
                instance_type=instance_type,
                accelerators=resources.accelerators,
                # The instance type fully determines CPU and memory.
                cpus=None,
                memory=None,
            ) for instance_type in instance_type_list
        ]

    accelerators = resources.accelerators
    if accelerators:
        # Only the first accelerator entry is used.
        accelerator_name, accelerator_count = next(iter(
            accelerators.items()))
        instance_types, errors = (
            shadeform_catalog.get_instance_type_for_accelerator(
                accelerator_name,
                accelerator_count,
                use_spot=resources.use_spot))

        if instance_types:
            return resources_utils.FeasibleResources(
                _make_resources(instance_types), list(instance_types), None)

        error_msg = (f'No instances available for accelerator '
                     f'{accelerator_name}')
        if errors:
            error_msg += f': {"; ".join(errors)}'
        return resources_utils.FeasibleResources([], [], error_msg)

    # No accelerators specified, return a default instance type
    default_instance_type = Shadeform.get_default_instance_type(
        cpus=resources.cpus,
        memory=resources.memory,
        disk_tier=resources.disk_tier,
        region=resources.region,
        zone=resources.zone)
    if default_instance_type is None:
        # TODO: Add hints to all return values in this method to help
        # users understand why the resources are not launchable.
        return resources_utils.FeasibleResources([], [], None)
    return resources_utils.FeasibleResources(
        _make_resources([default_instance_type]), [], None)
383
+
384
+ @classmethod
385
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
    """Run ``check_credentials`` for the COMPUTE capability.

    A dict-valued message is converted with ``str()`` so the second
    element is always a string or None.
    """
    ok, detail = cls.check_credentials(clouds.CloudCapability.COMPUTE)
    # Convert return type to match expected signature
    detail = str(detail) if isinstance(detail, dict) else detail
    return ok, detail
392
+
393
+ @classmethod
394
def query_status(cls, name: str, tag_filters: Dict[str, str],
                 region: Optional[str], zone: Optional[str],
                 **kwargs) -> List[status_lib.ClusterStatus]:
    """Return an empty status list; no argument is read.

    For validation purposes this reports no existing clusters. Actual
    status querying is handled by the provisioner.
    """
    statuses: List[status_lib.ClusterStatus] = []
    return statuses
sky/clouds/ssh.py ADDED
@@ -0,0 +1,263 @@
1
+ """SSH Node Pools"""
2
+
3
+ import os
4
+ import typing
5
+ from typing import Dict, List, Optional, Set, Tuple, Union
6
+
7
+ from sky import sky_logging
8
+ from sky import skypilot_config
9
+ from sky.adaptors import kubernetes as kubernetes_adaptor
10
+ from sky.clouds import kubernetes
11
+ from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.utils import annotations
13
+ from sky.utils import common_utils
14
+ from sky.utils import registry
15
+ from sky.utils import yaml_utils
16
+
17
+ if typing.TYPE_CHECKING:
18
+ # Renaming to avoid shadowing variables.
19
+ from sky import resources as resources_lib
20
+
21
+ logger = sky_logging.init_logger(__name__)
22
+
23
+ SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
24
+
25
+
26
+ @registry.CLOUD_REGISTRY.register()
27
class SSH(kubernetes.Kubernetes):
    """SSH cloud implementation.

    This is used by SSH Node Pools in SkyPilot, which use Kubernetes to manage
    the SSH clusters.

    This cloud is a thin wrapper around Kubernetes that only uses contexts
    starting with 'ssh-', which are managed through `sky ssh up` command.
    """

    _REPR = 'SSH'

    # Keep track of contexts that have been logged as unreachable
    logged_unreachable_contexts: Set[str] = set()

    def __repr__(self):
        """Return the fixed display token for this cloud."""
        return self._REPR

    @classmethod
    def _unsupported_features_for_resources(
        cls,
        resources: 'resources_lib.Resources',
        region: Optional[str] = None,
    ) -> Dict[kubernetes.clouds.CloudImplementationFeatures, str]:
        """Return the unsupported features reported by the parent class."""
        # Inherit all Kubernetes unsupported features
        return super()._unsupported_features_for_resources(resources, region)

    @classmethod
    def get_ssh_node_pool_contexts(cls) -> List[str]:
        """Get context names from ssh_node_pools.yaml file.

        Reads the SSH node pools configuration file and returns
        a list of context names by prepending 'ssh-' to each Node Pool name.

        Returns:
            A list of SSH Kubernetes context names derived from the Node Pools
            in the SSH node pools file. Empty when the file is missing,
            empty, or cannot be read or parsed.
        """
        contexts = []

        if os.path.exists(SSH_NODE_POOLS_PATH):
            try:
                with open(SSH_NODE_POOLS_PATH, 'r', encoding='utf-8') as f:
                    ssh_config = yaml_utils.safe_load(f)
                    if ssh_config:
                        # Get cluster names and prepend 'ssh-' to match
                        # context naming convention
                        contexts = [
                            f'ssh-{cluster_name}'
                            for cluster_name in ssh_config.keys()
                        ]
            except Exception:  # pylint: disable=broad-except
                # Deliberately best-effort: if there's an error reading
                # the file, return empty list
                pass

        return contexts

    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
        """Validate that ``region`` names a usable SSH Node Pool context.

        Args:
            region: Context name to validate, or None.
            zone: Must be None unless ``region`` is the in-cluster context.

        Returns:
            The ``(region, zone)`` pair unchanged.

        Raises:
            ValueError: If ``region`` is not among the allowed existing
                contexts, or if ``zone`` is set.
        """
        if region == kubernetes_adaptor.in_cluster_context_name():
            # If running incluster, we set region to IN_CLUSTER_REGION
            # since there is no context name available.
            return region, zone

        all_contexts = self.existing_allowed_contexts()

        if region is not None and region not in all_contexts:
            # Show pool names without the internal 'ssh-' prefix.
            region_name = common_utils.removeprefix(region, 'ssh-')
            available_contexts = [
                common_utils.removeprefix(c, 'ssh-') for c in all_contexts
            ]
            err_str = (f'SSH Node Pool {region_name!r} is not set up. '
                       'Run `sky check` for more details. ')
            if available_contexts:
                err_str += f'Available node pools: {available_contexts}'
            raise ValueError(err_str)
        if zone is not None:
            raise ValueError('SSH Node Pools do not support setting zone.')
        return region, zone

    @classmethod
    @annotations.lru_cache(scope='global', maxsize=1)
    def _ssh_log_skipped_contexts_once(
            cls, skipped_contexts: Tuple[str, ...]) -> None:
        """Log skipped contexts for only once.

        We don't directly cache the result of _filter_existing_allowed_contexts
        as the admin policy may update the allowed contexts.
        """
        if skipped_contexts:
            count = len(set(skipped_contexts))
            is_singular = count == 1
            logger.warning(
                f'SSH Node {("Pool" if is_singular else "Pools")} '
                f'{set(skipped_contexts)!r} specified in '
                f'{SSH_NODE_POOLS_PATH} {("has" if is_singular else "have")} '
                'not been set up. Skipping '
                f'{("that pool" if is_singular else "those pools")}. '
                'Run `sky ssh up` to set up.')

    @classmethod
    def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
        """Get existing allowed contexts that start with 'ssh-'.

        Override the Kubernetes implementation to only return contexts that
        start with 'ssh-', which are created by `sky ssh up`.

        Returns contexts based on clusters defined in ~/.sky/ssh_node_pools.yaml

        Args:
            silent: When true, do not log a warning for node pools that are
                listed in the file but have no matching context.
        """
        # Get all contexts from the Kubernetes implementation
        all_contexts = kubernetes_utils.get_all_kube_context_names()
        if not all_contexts:
            return []

        all_contexts = set(all_contexts)

        # Workspace-level allowed_node_pools should take precedence over
        # the global allowed_node_pools.
        allowed_node_pools = skypilot_config.get_workspace_cloud('ssh').get(
            'allowed_node_pools', None)
        if allowed_node_pools is None:
            allowed_node_pools = skypilot_config.get_effective_region_config(
                cloud='ssh',
                region=None,
                keys=('allowed_node_pools',),
                default_value=None)

        # Filter for SSH contexts (those starting with 'ssh-')
        ssh_contexts = [
            context for context in all_contexts if context.startswith('ssh-')
        ]

        # Get contexts from SSH node pools file
        all_node_pool_contexts = cls.get_ssh_node_pool_contexts()

        def filter_by_allowed_node_pools(ctxs):
            """Keep only contexts whose pool name is explicitly allowed."""
            if allowed_node_pools is None:
                return ctxs
            return [
                ctx for ctx in ctxs
                if common_utils.removeprefix(ctx, 'ssh-') in allowed_node_pools
            ]

        if all_node_pool_contexts:
            # Only include allowed contexts that exist
            existing_contexts = []
            skipped_contexts = []
            for context in all_node_pool_contexts:
                if context in ssh_contexts:
                    existing_contexts.append(context)
                else:
                    skipped_contexts.append(context)
            if not silent:
                cls._ssh_log_skipped_contexts_once(tuple(skipped_contexts))
            return filter_by_allowed_node_pools(existing_contexts)

        # If no all_node_pool_contexts found, return all SSH contexts
        return filter_by_allowed_node_pools(ssh_contexts)

    @classmethod
    def _check_compute_credentials(
            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
        """Check if the user has access credentials to SSH contexts.

        Returns:
            ``(False, reason)`` when dependencies are missing, contexts
            cannot be listed, or no context exists; otherwise
            ``(success, per_context_text)`` where ``success`` is true if at
            least one context passed its check.
        """
        # Check for port forward dependencies - reuse Kubernetes implementation
        reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
        if reasons is not None:
            formatted = '\n'.join(
                [reasons[0]] +
                [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
            return (False, formatted)

        # Get SSH contexts
        try:
            existing_allowed_contexts = cls.existing_allowed_contexts()
        except Exception as e:  # pylint: disable=broad-except
            return (False, f'Failed to get SSH contexts: {str(e)}')

        if not existing_allowed_contexts:
            return (False,
                    'No SSH Node Pools are up. Run `sky ssh up` to set up '
                    f'Node Pools from {SSH_NODE_POOLS_PATH}.')

        # Check credentials for each context
        ctx2text = {}
        success = False
        for context in existing_allowed_contexts:
            suc, text = super()._check_single_context(context)
            success = success or suc
            ctx2text[context] = text

        return success, ctx2text

    @classmethod
    def check_single_context(cls, context: str) -> Tuple[bool, str]:
        """Checks if the context is valid and accessible.

        Args:
            context: Context name, with or without the 'ssh-' prefix.

        Returns:
            ``(True, message)`` when the context passes its check,
            otherwise ``(False, reason)``.
        """
        reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
        if reasons is not None:
            formatted = '\n'.join(
                [reasons[0]] +
                [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
            return (False, formatted)

        # Add ssh- prefix to the context
        if not context.startswith('ssh-'):
            context = f'ssh-{context}'

        # Get SSH contexts
        try:
            existing_allowed_contexts = cls.existing_allowed_contexts()
        except Exception as e:  # pylint: disable=broad-except
            return (False, f'Failed to get SSH contexts: {str(e)}')

        if not existing_allowed_contexts:
            return (False,
                    'No SSH Node Pools are up. Run `sky ssh up` to set up '
                    f'Node Pools from {SSH_NODE_POOLS_PATH}.')

        if context not in existing_allowed_contexts:
            # NOTE(review): ``context`` carries the 'ssh-' prefix here, so
            # the message shows the prefixed name - confirm that is intended.
            return (False, f'SSH Node Pool {context} is not set up. '
                    f'Run `sky ssh up --infra {context}` to set up.')

        # Check if the context is valid
        suc, text = super()._check_single_context(context)
        if not suc:
            return (False, text)

        return (True, 'SSH Node Pool is set up.')

    @classmethod
    def expand_infras(cls) -> List[str]:
        """Return one ``<canonical_name>/<pool>`` entry per allowed context.

        The pool name is the context name with its literal 'ssh-' prefix
        removed. ``str.lstrip('ssh-')`` must not be used for this: it
        strips any leading run of the characters 's', 'h' and '-', so a
        context such as 'ssh-shared' would be truncated to 'ared'.
        """
        return [
            f'{cls.canonical_name()}/{common_utils.removeprefix(c, "ssh-")}'
            for c in cls.existing_allowed_contexts(silent=True)
        ]

    @classmethod
    def display_name(cls) -> str:
        """Return the human-readable name of this cloud."""
        return 'SSH Node Pools'
@@ -28,10 +28,16 @@ class AWSReservation:
28
28
 
29
29
 
30
30
  def use_reservations() -> bool:
31
- prioritize_reservations = skypilot_config.get_nested(
32
- ('aws', 'prioritize_reservations'), False)
33
- specific_reservations = skypilot_config.get_nested(
34
- ('aws', 'specific_reservations'), set())
31
+ prioritize_reservations = skypilot_config.get_effective_region_config(
32
+ cloud='aws',
33
+ region=None,
34
+ keys=('prioritize_reservations',),
35
+ default_value=False)
36
+ specific_reservations = skypilot_config.get_effective_region_config(
37
+ cloud='aws',
38
+ region=None,
39
+ keys=('specific_reservations',),
40
+ default_value=set())
35
41
  return prioritize_reservations or specific_reservations
36
42
 
37
43