skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,437 @@
1
+ """Hyperbolic instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ from sky import sky_logging
6
+ from sky.provision import common
7
+ from sky.provision.hyperbolic import utils
8
+ from sky.utils import status_lib
9
+
10
+ PROVIDER_NAME = 'hyperbolic'
11
+ POLL_INTERVAL = 5
12
+ QUERY_PORTS_TIMEOUT_SECONDS = 30
13
+ # TODO: come up with a reasonable value for this timeout
14
+ TIMEOUT = 300
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def _filter_instances(cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]],
                      head_only: bool = False) -> Dict[str, Dict[str, Any]]:
    """Return the cluster's instances whose status passes ``status_filters``.

    Args:
        cluster_name_on_cloud: Cluster name handed to
            ``utils.list_instances`` as ``skypilot.cluster_name`` metadata.
        status_filters: Statuses to keep, compared case-insensitively.
            ``None`` keeps every instance regardless of its status.
        head_only: Accepted but ignored by this function.

    Returns:
        A dict mapping instance ID to its instance record for every
        instance that passed the status check. If handling an instance
        raises, a warning is logged and processing moves on to the next
        instance.
    """
    logger.debug(f'Filtering instances: cluster={cluster_name_on_cloud}, '
                 f'status={status_filters}')
    del head_only  # Intentionally unused.

    # Restrict the listing to this cluster through the metadata query.
    cluster_metadata = {'skypilot': {'cluster_name': cluster_name_on_cloud}}
    cluster_instances = utils.list_instances(metadata=cluster_metadata)

    # Compare statuses case-insensitively: lowercase the wanted statuses
    # once here, and each instance status inside the loop.
    if status_filters is not None:
        status_filters = [s.lower() for s in status_filters]

    matched: Dict[str, Dict[str, Any]] = {}
    for instance_id, instance in cluster_instances.items():
        try:
            instance_status = instance.get('status', '').lower()
            keep = (status_filters is None or
                    instance_status in status_filters)
            if keep:
                matched[instance_id] = instance
                logger.debug(f'Including instance {instance_id} '
                             f'with status {instance_status}')
            else:
                logger.debug(
                    f'Skipping instance {instance_id} '
                    f'- status {instance_status} not in {status_filters}')
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: one malformed record must not abort the listing.
            logger.warning(f'Error processing instance {instance_id}: {str(e)}')

    logger.info(f'Found {len(matched)} instances matching filters')
    return matched
58
+
59
+
60
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
61
+ """Get the instance ID from the instances dict."""
62
+ if not instances:
63
+ return None
64
+ return next(iter(instances.keys()))
65
+
66
+
67
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
68
+ config: common.ProvisionConfig) -> common.ProvisionRecord:
69
+ del cluster_name # unused
70
+ logger.info(f'Starting run_instances with region={region}, '
71
+ f'cluster={cluster_name_on_cloud}')
72
+ logger.debug(f'Config: {config}')
73
+ start_time = time.time()
74
+
75
+ # Define pending statuses for Hyperbolic
76
+ pending_status = [
77
+ utils.HyperbolicInstanceStatus.CREATING.value,
78
+ utils.HyperbolicInstanceStatus.STARTING.value
79
+ ]
80
+ logger.debug(
81
+ f'Looking for instances with pending statuses: {pending_status}')
82
+
83
+ # Wait for any pending instances to be ready
84
+ while True:
85
+ if time.time() - start_time > TIMEOUT:
86
+ logger.error(
87
+ f'Timed out after {TIMEOUT}s waiting for instances to be ready')
88
+ raise TimeoutError(
89
+ f'Timed out after {TIMEOUT}s waiting for instances to be ready')
90
+
91
+ instances = _filter_instances(cluster_name_on_cloud, pending_status)
92
+ logger.debug(f'Found {len(instances)} instances with pending status')
93
+ if not instances:
94
+ break
95
+ logger.info(
96
+ f'Waiting for instance to be ready. Current instances: {instances}')
97
+ time.sleep(POLL_INTERVAL)
98
+
99
+ # Check existing running instance
100
+ logger.info('Checking for existing running instances')
101
+ exist_instances = _filter_instances(
102
+ cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
103
+ logger.debug(
104
+ f'Found {len(exist_instances)} running instances: {exist_instances}')
105
+ instance_id = _get_head_instance_id(exist_instances)
106
+ logger.debug(f'Head instance ID: {instance_id}')
107
+
108
+ # Calculate if we need to start a new instance
109
+ to_start_count = 1 - len(exist_instances) # Always 1 for single node
110
+ logger.info(f'Need to start {to_start_count} new instances')
111
+ if to_start_count < 0:
112
+ logger.error(
113
+ f'Cluster {cluster_name_on_cloud} already has an instance running')
114
+ raise RuntimeError(
115
+ f'Cluster {cluster_name_on_cloud} already has an instance running.')
116
+ if to_start_count == 0:
117
+ if instance_id is None:
118
+ logger.error(
119
+ f'Cluster {cluster_name_on_cloud} has no running instance')
120
+ raise RuntimeError(
121
+ f'Cluster {cluster_name_on_cloud} has no running instance.')
122
+ logger.info(
123
+ f'Cluster {cluster_name_on_cloud} already has a running instance')
124
+ return common.ProvisionRecord(provider_name=PROVIDER_NAME,
125
+ cluster_name=cluster_name_on_cloud,
126
+ region='default',
127
+ zone=None,
128
+ head_instance_id=instance_id,
129
+ resumed_instance_ids=[],
130
+ created_instance_ids=[])
131
+
132
+ try:
133
+ # Get instance type from node_config
134
+ instance_type = config.node_config.get('InstanceType')
135
+ logger.debug(f'Instance type from config: {instance_type}')
136
+ if not instance_type:
137
+ logger.error('InstanceType is not set in node_config')
138
+ raise RuntimeError(
139
+ 'InstanceType is not set in node_config. '
140
+ 'Please specify an instance type for Hyperbolic.')
141
+
142
+ # Parse gpu_model configuration from instance type
143
+ # Format: {gpu_count}x-{gpu_model}-{cpu}-{memory}
144
+ # Example: 1x-A100-24-271
145
+ try:
146
+ parts = instance_type.split('-')
147
+ if len(parts) != 4:
148
+ raise ValueError(
149
+ f'Invalid instance type format: {instance_type}. '
150
+ 'Expected format: {gpu_count}x-{gpu_model}-{cpu}-{memory}')
151
+
152
+ gpu_count = int(parts[0].rstrip('x'))
153
+ gpu_model = parts[1]
154
+ logger.info(f'Parsed GPU config from instance type: '
155
+ f'model={gpu_model}, count={gpu_count}')
156
+
157
+ # Launch instance
158
+ instance_id, ssh_command = utils.launch_instance(
159
+ gpu_model, gpu_count, cluster_name_on_cloud)
160
+ logger.info(f'Launched instance {instance_id} with SSH command: '
161
+ f'{ssh_command}')
162
+ created_instance_ids = [instance_id]
163
+
164
+ # Wait for instance to be ready
165
+ if not utils.wait_for_instance(
166
+ instance_id, utils.HyperbolicInstanceStatus.ONLINE.value):
167
+ raise RuntimeError(
168
+ f'Instance {instance_id} failed to reach ONLINE state')
169
+
170
+ except ValueError as e:
171
+ logger.error(f'Failed to parse instance type: {e}')
172
+ raise RuntimeError(str(e)) from e
173
+ except Exception as e:
174
+ logger.error(f'Failed to launch instance: {e}')
175
+ raise RuntimeError(str(e)) from e
176
+
177
+ except Exception as e:
178
+ logger.error(f'Unexpected error: {e}')
179
+ raise
180
+
181
+ # Wait for instance to be ready
182
+ logger.info(f'Waiting for instance {instance_id} to be ready')
183
+ while True:
184
+ instances = _filter_instances(
185
+ cluster_name_on_cloud,
186
+ [utils.HyperbolicInstanceStatus.ONLINE.value])
187
+ logger.debug(f'Current instances: {instances}')
188
+ if len(instances) == 1:
189
+ logger.info(f'Instance {instance_id} is ready')
190
+ break
191
+ if time.time() - start_time > TIMEOUT:
192
+ logger.error(
193
+ f'Timed out after {TIMEOUT}s waiting for instance to be ready')
194
+ raise TimeoutError(
195
+ f'Timed out after {TIMEOUT}s waiting for instance to be ready')
196
+ logger.info('Waiting for instance to be ready...')
197
+ time.sleep(POLL_INTERVAL)
198
+
199
+ logger.info(f'Returning ProvisionRecord for instance {instance_id}')
200
+ return common.ProvisionRecord(provider_name=PROVIDER_NAME,
201
+ cluster_name=cluster_name_on_cloud,
202
+ region='default',
203
+ zone=None,
204
+ head_instance_id=instance_id,
205
+ resumed_instance_ids=[],
206
+ created_instance_ids=created_instance_ids)
207
+
208
+
209
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    worker_only: bool = False,
) -> None:
    """Terminate every instance of the cluster and wait for them to go away.

    Termination is best-effort: a failure to terminate one instance is logged
    as a warning and the remaining instances are still processed. After the
    requests are issued, the cluster is polled every ``POLL_INTERVAL`` seconds
    until no instance is left in a non-terminated status. If that does not
    happen within ``TIMEOUT`` seconds an error is logged and the function
    returns without raising.

    Args:
        cluster_name_on_cloud: Cluster name used to look up its instances.
        provider_config: Unused.
        worker_only: Unused; all instances of the cluster are terminated.
    """
    del provider_config, worker_only  # unused
    logger.info(
        f'Terminating all instances for cluster {cluster_name_on_cloud}')

    # Nothing to do if the cluster has no instances at all.
    instances = _filter_instances(cluster_name_on_cloud, None)
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    # Request termination of each instance; keep going on individual failures.
    for instance_id in instances:
        try:
            utils.terminate_instance(instance_id)
            logger.info(f'Terminated instance {instance_id}')
        except Exception as e:  # pylint: disable=broad-except
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')

    terminated_status = str(
        utils.HyperbolicInstanceStatus.TERMINATED.value).lower()

    start_time = time.time()
    while True:
        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instances to terminate'
            )
            break

        # An instance is still pending while it is listed with any status
        # other than TERMINATED. Instances that are no longer listed count as
        # gone. Filtering *for* TERMINATED here (as the previous code did)
        # would report success while instances are still shutting down.
        # NOTE(review): assumes each instance record is a dict with a
        # 'status' key, as query_instances reads it - confirm.
        remaining = [
            instance_id for instance_id, info in _filter_instances(
                cluster_name_on_cloud, None).items()
            if str(info.get('status', '')).lower() != terminated_status
        ]
        if not remaining:
            logger.info('All instances terminated successfully')
            break

        logger.info('Waiting for instances to terminate...')
        time.sleep(POLL_INTERVAL)
252
+
253
+
254
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Returns information about the cluster's ONLINE instances.

    The host, port and SSH user of each instance are taken from its
    ``sshCommand`` field, expected in the form ``ssh user@hostname -p port``.
    When the field is missing or has fewer than four tokens, the instance ID
    is used as the hostname and the port defaults to 22. The first instance
    iterated becomes the head instance.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster name used to look up its instances.
        provider_config: Passed through unchanged into the result.

    Returns:
        A ``common.ClusterInfo`` describing the ONLINE instances. ``ssh_user``
        is ``None`` when no instance exposed a ``user@host`` SSH command.

    Raises:
        ValueError: If the fourth token of an ``sshCommand`` is not an integer.
    """
    del region  # unused
    running_instances = _filter_instances(
        cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance_id = None
    # Bind up front: previously this name was only assigned inside the parsing
    # branch, so an empty cluster or an instance without a 'user@host' command
    # raised UnboundLocalError when building the ClusterInfo below.
    # NOTE(review): assumes ClusterInfo accepts ssh_user=None - confirm.
    ssh_user: Optional[str] = None

    for instance_id, instance_info in running_instances.items():
        # Fallback values used when no usable sshCommand is available.
        hostname = instance_id
        port = 22

        # Format: ssh user@hostname -p port
        parts = (instance_info.get('sshCommand') or '').split()
        if len(parts) >= 4:
            user_host = parts[1]  # user@hostname
            if '@' in user_host:
                ssh_user = user_host.split('@')[0]
                hostname = user_host.split('@')[1]
            else:
                hostname = user_host
            port = int(parts[3])

        # The same address is reported as both internal and external IP.
        instances[instance_id] = [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=hostname,
                external_ip=hostname,
                ssh_port=port,
                tags={},
            )
        ]
        if head_instance_id is None:
            head_instance_id = instance_id

    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name=PROVIDER_NAME,
        provider_config=provider_config,
        ssh_user=ssh_user,
    )
305
+
306
+
307
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Returns the status of the cluster's instances on Hyperbolic.

    Args:
        cluster_name: Unused.
        cluster_name_on_cloud: Cluster name used to look up its instances.
        provider_config: Unused.
        non_terminated_only: When true, instances whose converted cluster
            status is ``None`` are left out of the result.
        retry_if_missing: Unused.

    Returns:
        A mapping from instance ID to ``(cluster status, None)``. The mapping
        is empty when no instances are listed for the cluster. Instances whose
        status raises ``utils.HyperbolicError`` during conversion are skipped
        with a warning.
    """
    del cluster_name, provider_config, retry_if_missing  # unused

    # Look up every instance tagged with this cluster name.
    cluster_filter = {'skypilot': {'cluster_name': cluster_name_on_cloud}}
    found = utils.list_instances(metadata=cluster_filter)

    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                              Optional[str]]] = {}
    if not found:
        # No instances found: empty dict indicates fully deleted.
        return statuses

    for instance_id, instance in found.items():
        try:
            parsed = utils.HyperbolicInstanceStatus.from_raw_status(
                instance.get('status', 'unknown').lower())
            status = parsed.to_cluster_status()
        except utils.HyperbolicError as e:
            logger.warning(
                f'Failed to parse status for instance {instance_id}: {e}')
            continue
        if status is None and non_terminated_only:
            continue
        statuses[instance_id] = (status, None)
    return statuses
341
+
342
+
343
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Check that the cluster's instances match the desired state.

    This performs a one-shot check; it does not poll or sleep.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster name used to look up its instances.
        state: Desired state; only ``UP`` and ``STOPPED`` are handled.

    Raises:
        RuntimeError: If ``state`` is neither ``UP`` nor ``STOPPED``; if no
            instance is in the expected Hyperbolic status (ONLINE for ``UP``,
            TERMINATED for ``STOPPED``), with a dedicated message when
            FAILED/ERROR instances exist; or if any instance is in the
            opposite status.
    """
    del region  # unused

    cluster_status = status_lib.ClusterStatus
    if state not in (cluster_status.UP, cluster_status.STOPPED):
        raise RuntimeError(f'Unsupported state: {state}')

    hyperbolic_status = utils.HyperbolicInstanceStatus

    def _raise_missing(not_found_message: str) -> None:
        # Prefer reporting failed instances over the generic message.
        failed_instances = _filter_instances(cluster_name_on_cloud, [
            hyperbolic_status.FAILED.value,
            hyperbolic_status.ERROR.value,
        ])
        if failed_instances:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has failed instances: '
                f'{failed_instances}')
        raise RuntimeError(not_found_message)

    if state == cluster_status.UP:
        # At least one instance must be ONLINE ...
        if not _filter_instances(cluster_name_on_cloud,
                                 [hyperbolic_status.ONLINE.value]):
            _raise_missing(f'No running instances found for cluster '
                           f'{cluster_name_on_cloud}')
        # ... and none may be TERMINATED.
        terminated_instances = _filter_instances(
            cluster_name_on_cloud, [hyperbolic_status.TERMINATED.value])
        if terminated_instances:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} is in UP state, but '
                f'{len(terminated_instances)} instances are terminated.')
        return

    # state == STOPPED: at least one instance must be TERMINATED ...
    if not _filter_instances(cluster_name_on_cloud,
                             [hyperbolic_status.TERMINATED.value]):
        _raise_missing(f'No terminated instances found for cluster '
                       f'{cluster_name_on_cloud}')
    # ... and none may still be ONLINE.
    running_instances = _filter_instances(cluster_name_on_cloud,
                                          [hyperbolic_status.ONLINE.value])
    if running_instances:
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} is in STOPPED state, but '
            f'{len(running_instances)} instances are running.')
401
+
402
+
403
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Stop running instances. Not supported for Hyperbolic.

    Args:
        cluster_name_on_cloud: Unused.
        provider_config: Unused.
        worker_only: Unused.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    message = 'stop_instances is not supported for Hyperbolic'
    raise NotImplementedError(message)
410
+
411
+
412
def cleanup_ports(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    ports: Optional[list] = None,
) -> None:
    """Cleanup ports. Not supported for Hyperbolic.

    Args:
        cluster_name_on_cloud: Unused.
        provider_config: Unused.
        ports: Unused.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, ports  # unused
    message = 'cleanup_ports is not supported for Hyperbolic'
    raise NotImplementedError(message)
419
+
420
+
421
def cleanup_custom_multi_network(
    cluster_name_on_cloud: str,
    provider_config: Dict[str, Any],
    failover: bool = False,
) -> None:
    """Cleanup custom multi-network. Not supported for Hyperbolic.

    Args:
        cluster_name_on_cloud: Unused.
        provider_config: Unused.
        failover: Unused.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, failover  # unused
    message = 'cleanup_custom_multi_network is not supported for Hyperbolic'
    raise NotImplementedError(message)
429
+
430
+
431
def open_ports(
    cluster_name_on_cloud: str,
    ports: list,
    provider_config: Optional[dict] = None,
) -> None:
    """Open ports. Not supported for Hyperbolic.

    Args:
        cluster_name_on_cloud: Unused.
        ports: Unused.
        provider_config: Unused.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
    message = 'open_ports is not supported for Hyperbolic'
    raise NotImplementedError(message)