skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,437 @@
1
+ """Hyperbolic instance provisioning."""
2
+ import time
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ from sky import sky_logging
6
+ from sky.provision import common
7
+ from sky.provision.hyperbolic import utils
8
+ from sky.utils import status_lib
9
+
10
# Provider identifier reported in the ProvisionRecord / ClusterInfo objects
# built by this module.
PROVIDER_NAME = 'hyperbolic'
# Seconds to sleep between two consecutive status polls.
POLL_INTERVAL = 5
# NOTE(review): not referenced by any function visible in this module;
# presumably consumed elsewhere - confirm before removing.
QUERY_PORTS_TIMEOUT_SECONDS = 30
# TODO: come up with a reasonable value for this timeout.
# Upper bound, in seconds, for the polling loops in run_instances and
# terminate_instances.
TIMEOUT = 300

logger = sky_logging.init_logger(__name__)
17
+
18
+
19
def _filter_instances(cluster_name_on_cloud: str,
                      status_filters: Optional[List[str]],
                      head_only: bool = False) -> Dict[str, Dict[str, Any]]:
    """List the cluster's instances, optionally restricted by status.

    Instances are fetched through ``utils.list_instances`` using the cluster
    name as metadata. Statuses are compared case-insensitively.

    Args:
        cluster_name_on_cloud: Cluster name sent as the
            ``skypilot.cluster_name`` metadata filter.
        status_filters: Statuses to keep. ``None`` keeps every instance.
        head_only: Accepted but ignored by this implementation.

    Returns:
        Mapping from instance ID to the instance dict returned by
        ``utils.list_instances``. An instance whose processing raises is
        logged with a warning and left out.
    """
    logger.debug(f'Filtering instances: cluster={cluster_name_on_cloud}, '
                 f'status={status_filters}')
    del head_only  # Intentionally unused.

    # Restrict the listing to this cluster through its metadata.
    cluster_metadata = {'skypilot': {'cluster_name': cluster_name_on_cloud}}
    all_instances = utils.list_instances(metadata=cluster_metadata)

    # Lowercase the requested statuses once so matching ignores case.
    wanted: Optional[List[str]] = None
    if status_filters is not None:
        wanted = [status.lower() for status in status_filters]

    matched: Dict[str, Dict[str, Any]] = {}
    for instance_id, instance in all_instances.items():
        try:
            current = instance.get('status', '').lower()
            if wanted is None or current in wanted:
                matched[instance_id] = instance
                logger.debug(f'Including instance {instance_id} '
                             f'with status {current}')
            else:
                logger.debug(f'Skipping instance {instance_id} '
                             f'- status {current} not in {wanted}')
        except Exception as e:  # pylint: disable=broad-except
            # Best effort: one malformed record must not hide the others.
            logger.warning(f'Error processing instance {instance_id}: {str(e)}')

    logger.info(f'Found {len(matched)} instances matching filters')
    return matched
58
+
59
+
60
+ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
61
+ """Get the instance ID from the instances dict."""
62
+ if not instances:
63
+ return None
64
+ return next(iter(instances.keys()))
65
+
66
+
67
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
    """Ensure the cluster has exactly one ONLINE Hyperbolic instance.

    Steps:
      1. Poll until no instance of the cluster is CREATING or STARTING.
      2. If one ONLINE instance already exists, return a record for it.
      3. Otherwise parse the GPU model and count from
         ``config.node_config['InstanceType']``
         (``{gpu_count}x-{gpu_model}-{cpu}-{memory}``), launch an instance and
         poll until exactly one ONLINE instance is listed.

    Args:
        region: Only logged; the returned record always uses ``'default'``.
        cluster_name: Unused.
        cluster_name_on_cloud: Cluster name used to look up and tag instances.
        config: Provisioning config; only ``node_config`` is read.

    Returns:
        A ProvisionRecord whose head instance is the reused or newly created
        instance.

    Raises:
        TimeoutError: A polling loop exceeded ``TIMEOUT`` seconds, measured
            from the start of this call.
        RuntimeError: More than one ONLINE instance exists, the instance type
            is missing or malformed, or the launch failed.
    """
    del cluster_name  # unused
    logger.info(f'Starting run_instances with region={region}, '
                f'cluster={cluster_name_on_cloud}')
    logger.debug(f'Config: {config}')
    start_time = time.time()

    def _make_record(head_id: Optional[str],
                     created_ids: List[str]) -> common.ProvisionRecord:
        # Both exit paths build the same record apart from the created IDs.
        return common.ProvisionRecord(provider_name=PROVIDER_NAME,
                                      cluster_name=cluster_name_on_cloud,
                                      region='default',
                                      zone=None,
                                      head_instance_id=head_id,
                                      resumed_instance_ids=[],
                                      created_instance_ids=created_ids)

    # Statuses that mean an instance is still coming up.
    pending_status = [
        utils.HyperbolicInstanceStatus.CREATING.value,
        utils.HyperbolicInstanceStatus.STARTING.value,
    ]
    logger.debug(
        f'Looking for instances with pending statuses: {pending_status}')

    # Step 1: let any in-flight instance settle before deciding what to do.
    while True:
        if time.time() - start_time > TIMEOUT:
            timeout_msg = (
                f'Timed out after {TIMEOUT}s waiting for instances to be ready')
            logger.error(timeout_msg)
            raise TimeoutError(timeout_msg)

        pending = _filter_instances(cluster_name_on_cloud, pending_status)
        logger.debug(f'Found {len(pending)} instances with pending status')
        if not pending:
            break
        logger.info(
            f'Waiting for instance to be ready. Current instances: {pending}')
        time.sleep(POLL_INTERVAL)

    # Step 2: reuse an instance that is already online.
    logger.info('Checking for existing running instances')
    online_status = utils.HyperbolicInstanceStatus.ONLINE.value
    exist_instances = _filter_instances(cluster_name_on_cloud, [online_status])
    logger.debug(
        f'Found {len(exist_instances)} running instances: {exist_instances}')
    instance_id = _get_head_instance_id(exist_instances)
    logger.debug(f'Head instance ID: {instance_id}')

    # Hyperbolic clusters are single node, so the target count is always 1.
    to_start_count = 1 - len(exist_instances)
    logger.info(f'Need to start {to_start_count} new instances')
    if to_start_count < 0:
        logger.error(
            f'Cluster {cluster_name_on_cloud} already has an instance running')
        raise RuntimeError(
            f'Cluster {cluster_name_on_cloud} already has an instance running.')
    if to_start_count == 0:
        if instance_id is None:
            logger.error(
                f'Cluster {cluster_name_on_cloud} has no running instance')
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has no running instance.')
        logger.info(
            f'Cluster {cluster_name_on_cloud} already has a running instance')
        return _make_record(instance_id, [])

    # Step 3: launch a new instance.
    try:
        instance_type = config.node_config.get('InstanceType')
        logger.debug(f'Instance type from config: {instance_type}')
        if not instance_type:
            logger.error('InstanceType is not set in node_config')
            raise RuntimeError(
                'InstanceType is not set in node_config. '
                'Please specify an instance type for Hyperbolic.')

        # Instance type format: {gpu_count}x-{gpu_model}-{cpu}-{memory}
        # Example: 1x-A100-24-271
        try:
            fields = instance_type.split('-')
            if len(fields) != 4:
                raise ValueError(
                    f'Invalid instance type format: {instance_type}. '
                    'Expected format: {gpu_count}x-{gpu_model}-{cpu}-{memory}')

            count_field, gpu_model = fields[0], fields[1]
            gpu_count = int(count_field.rstrip('x'))
            logger.info(f'Parsed GPU config from instance type: '
                        f'model={gpu_model}, count={gpu_count}')

            launched = utils.launch_instance(gpu_model, gpu_count,
                                             cluster_name_on_cloud)
            instance_id, ssh_command = launched
            logger.info(f'Launched instance {instance_id} with SSH command: '
                        f'{ssh_command}')
            created_instance_ids = [instance_id]

            reached_online = utils.wait_for_instance(
                instance_id, utils.HyperbolicInstanceStatus.ONLINE.value)
            if not reached_online:
                raise RuntimeError(
                    f'Instance {instance_id} failed to reach ONLINE state')

        except ValueError as e:
            logger.error(f'Failed to parse instance type: {e}')
            raise RuntimeError(str(e)) from e
        except Exception as e:
            # Anything else, including the RuntimeError above, is reported
            # as a launch failure.
            logger.error(f'Failed to launch instance: {e}')
            raise RuntimeError(str(e)) from e

    except Exception as e:
        logger.error(f'Unexpected error: {e}')
        raise

    # Confirm through the listing API that exactly one instance is ONLINE.
    logger.info(f'Waiting for instance {instance_id} to be ready')
    while True:
        online_now = _filter_instances(
            cluster_name_on_cloud,
            [utils.HyperbolicInstanceStatus.ONLINE.value])
        logger.debug(f'Current instances: {online_now}')
        if len(online_now) == 1:
            logger.info(f'Instance {instance_id} is ready')
            break
        if time.time() - start_time > TIMEOUT:
            timeout_msg = (
                f'Timed out after {TIMEOUT}s waiting for instance to be ready')
            logger.error(timeout_msg)
            raise TimeoutError(timeout_msg)
        logger.info('Waiting for instance to be ready...')
        time.sleep(POLL_INTERVAL)

    logger.info(f'Returning ProvisionRecord for instance {instance_id}')
    return _make_record(instance_id, created_instance_ids)
207
+
208
+
209
def terminate_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    worker_only: bool = False,
) -> None:
    """Terminate all instances in the cluster and wait for them to go away.

    Termination is requested for every instance listed for the cluster. The
    function then polls until none of the instances whose request succeeded
    is still listed in a non-terminated state, or until ``TIMEOUT`` seconds
    have passed. A timeout is logged but does not raise.

    Args:
        cluster_name_on_cloud: Cluster whose instances are terminated.
        provider_config: Unused.
        worker_only: Unused.
    """
    del provider_config, worker_only  # unused
    logger.info(
        f'Terminating all instances for cluster {cluster_name_on_cloud}')

    # First check if instances exist
    instances = _filter_instances(cluster_name_on_cloud, None)
    if not instances:
        logger.info(f'No instances found for cluster {cluster_name_on_cloud}')
        return

    # Request termination. Track the IDs whose request succeeded so a failed
    # request does not make the wait loop below block until the timeout.
    requested = set()
    for instance_id in instances:
        try:
            utils.terminate_instance(instance_id)
            logger.info(f'Terminated instance {instance_id}')
            requested.add(instance_id)
        except Exception as e:  # pylint: disable=broad-except
            logger.warning(f'Failed to terminate instance {instance_id}: {e}')

    if not requested:
        return

    # Wait until no requested instance is listed in a non-terminated state.
    # Polling for status == TERMINATED and stopping when nothing matched
    # would report success while the instances are still running.
    terminated_status = (
        utils.HyperbolicInstanceStatus.TERMINATED.value.lower())
    start_time = time.time()
    while True:
        if time.time() - start_time > TIMEOUT:
            logger.error(
                f'Timed out after {TIMEOUT}s waiting for instances to terminate'
            )
            break

        current = _filter_instances(cluster_name_on_cloud, None)
        still_alive = [
            instance_id for instance_id, instance in current.items()
            if instance_id in requested and
            (instance.get('status') or '').lower() != terminated_status
        ]
        if not still_alive:
            logger.info('All instances terminated successfully')
            break

        logger.info('Waiting for instances to terminate...')
        time.sleep(POLL_INTERVAL)
252
+
253
+
254
def get_cluster_info(
        region: str,
        cluster_name_on_cloud: str,
        provider_config: Optional[Dict[str, Any]] = None) -> common.ClusterInfo:
    """Returns information about the cluster.

    Only ONLINE instances are reported. Host, port and SSH user are parsed
    from each instance's ``sshCommand`` field, expected to look like
    ``ssh user@hostname -p port``. When the command is missing or has fewer
    than four whitespace-separated parts, the instance ID is used as the
    hostname and 22 as the port.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster whose instances are looked up.
        provider_config: Passed through unchanged into the result.

    Returns:
        A ClusterInfo with one InstanceInfo per ONLINE instance. The first
        instance iterated is the head. ``ssh_user`` is the user parsed from
        the last ``user@host`` command seen, or ``None`` if none was found.

    Raises:
        ValueError: The fourth token of an ``sshCommand`` is not an integer.
    """
    del region  # unused
    running_instances = _filter_instances(
        cluster_name_on_cloud, [utils.HyperbolicInstanceStatus.ONLINE.value])
    instances: Dict[str, List[common.InstanceInfo]] = {}
    head_instance_id: Optional[str] = None
    # Bound up front: with no running instance, or no 'user@host' token in
    # any sshCommand, the name would otherwise be unbound at the return
    # below and raise UnboundLocalError.
    ssh_user: Optional[str] = None

    for instance_id, instance_info in running_instances.items():
        # Fallbacks used when sshCommand is missing or too short.
        hostname = instance_id
        port = 22

        # Format: ssh user@hostname -p port
        ssh_command = instance_info.get('sshCommand', '')
        parts = ssh_command.split() if ssh_command else []
        if len(parts) >= 4:
            user_host = parts[1]  # user@hostname
            if '@' in user_host:
                ssh_user = user_host.split('@')[0]
                hostname = user_host.split('@')[1]
            else:
                hostname = user_host
            port = int(parts[3])

        instances[instance_id] = [
            common.InstanceInfo(
                instance_id=instance_id,
                internal_ip=hostname,
                external_ip=hostname,
                ssh_port=port,
                tags={},
            )
        ]
        if head_instance_id is None:
            head_instance_id = instance_id

    return common.ClusterInfo(
        instances=instances,
        head_instance_id=head_instance_id,
        provider_name=PROVIDER_NAME,
        provider_config=provider_config,
        ssh_user=ssh_user,
    )
305
+
306
+
307
def query_instances(
    cluster_name: str,
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    non_terminated_only: bool = True,
    retry_if_missing: bool = False,
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
    """Map each Hyperbolic instance of the cluster to its cluster status.

    Args:
        cluster_name: Unused.
        cluster_name_on_cloud: Cluster name used as the metadata filter.
        provider_config: Unused.
        non_terminated_only: When true, instances whose converted status is
            ``None`` are left out of the result.
        retry_if_missing: Unused.

    Returns:
        Mapping from instance ID to ``(status, None)``. Empty when no
        instance is listed for the cluster. Instances whose status raises
        ``utils.HyperbolicError`` during conversion are logged and skipped.
    """
    del cluster_name, provider_config, retry_if_missing  # unused

    cluster_metadata = {'skypilot': {'cluster_name': cluster_name_on_cloud}}
    listed = utils.list_instances(metadata=cluster_metadata)

    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                              Optional[str]]] = {}
    if not listed:
        # Nothing listed: an empty result signals a fully deleted cluster.
        return statuses

    for instance_id, instance in listed.items():
        try:
            raw_status = instance.get('status', 'unknown').lower()
            parsed = utils.HyperbolicInstanceStatus.from_raw_status(raw_status)
            status = parsed.to_cluster_status()
        except utils.HyperbolicError as e:
            logger.warning(
                f'Failed to parse status for instance {instance_id}: {e}')
            continue
        if status is not None or not non_terminated_only:
            statuses[instance_id] = (status, None)
    return statuses
341
+
342
+
343
def wait_instances(region: str, cluster_name_on_cloud: str,
                   state: Optional[status_lib.ClusterStatus]) -> None:
    """Check that the cluster's instances match the desired state.

    This performs a single check and does not poll. For ``UP`` the cluster
    must have at least one ONLINE instance and no TERMINATED one. For
    ``STOPPED`` it must have at least one TERMINATED instance and no ONLINE
    one.

    Args:
        region: Unused.
        cluster_name_on_cloud: Cluster whose instances are inspected.
        state: Desired cluster state; only ``UP`` and ``STOPPED`` are handled.

    Raises:
        RuntimeError: The state is unsupported, no instance is in the
            required status (failed instances are reported if present), or
            an instance is in the opposite status.
    """
    del region  # unused
    instance_status = utils.HyperbolicInstanceStatus

    # The UP and STOPPED checks mirror each other: pick the status that must
    # be present, the one that must be absent, and the message wording.
    if state == status_lib.ClusterStatus.UP:
        required = instance_status.ONLINE
        forbidden = instance_status.TERMINATED
        required_label, forbidden_label, state_label = ('running',
                                                        'terminated', 'UP')
    elif state == status_lib.ClusterStatus.STOPPED:
        required = instance_status.TERMINATED
        forbidden = instance_status.ONLINE
        required_label, forbidden_label, state_label = ('terminated',
                                                        'running', 'STOPPED')
    else:
        raise RuntimeError(f'Unsupported state: {state}')

    if not _filter_instances(cluster_name_on_cloud, [required.value]):
        # Nothing in the required status: report failed instances if any.
        failed_instances = _filter_instances(
            cluster_name_on_cloud,
            [instance_status.FAILED.value, instance_status.ERROR.value])
        if failed_instances:
            raise RuntimeError(
                f'Cluster {cluster_name_on_cloud} has failed instances: '
                f'{failed_instances}')
        raise RuntimeError(f'No {required_label} instances found for cluster '
                           f'{cluster_name_on_cloud}')

    conflicting = _filter_instances(cluster_name_on_cloud, [forbidden.value])
    if conflicting:
        error_msg = (
            f'Cluster {cluster_name_on_cloud} is in {state_label} state, but '
            f'{len(conflicting)} instances are {forbidden_label}.')
        raise RuntimeError(error_msg)
401
+
402
+
403
def stop_instances(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    worker_only: bool = False,
) -> None:
    """Reject the request: stopping is not supported for Hyperbolic.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, worker_only  # unused
    message = 'stop_instances is not supported for Hyperbolic'
    raise NotImplementedError(message)
410
+
411
+
412
def cleanup_ports(
    cluster_name_on_cloud: str,
    provider_config: Optional[dict] = None,
    ports: Optional[list] = None,
) -> None:
    """Reject the request: port cleanup is not supported for Hyperbolic.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, ports  # unused
    message = 'cleanup_ports is not supported for Hyperbolic'
    raise NotImplementedError(message)
419
+
420
+
421
def cleanup_custom_multi_network(
    cluster_name_on_cloud: str,
    provider_config: Dict[str, Any],
    failover: bool = False,
) -> None:
    """Reject the request: custom multi-network cleanup is not supported.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, provider_config, failover  # unused
    message = 'cleanup_custom_multi_network is not supported for Hyperbolic'
    raise NotImplementedError(message)
429
+
430
+
431
def open_ports(
    cluster_name_on_cloud: str,
    ports: list,
    provider_config: Optional[dict] = None,
) -> None:
    """Reject the request: opening ports is not supported for Hyperbolic.

    Raises:
        NotImplementedError: Always.
    """
    del cluster_name_on_cloud, ports, provider_config  # unused
    message = 'open_ports is not supported for Hyperbolic'
    raise NotImplementedError(message)