skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,373 @@
1
+ """Hyperbolic API utilities."""
2
+ import enum
3
+ import json
4
+ import os
5
+ import time
6
+ from typing import Any, Dict, Optional, Tuple
7
+
8
+ import requests
9
+
10
+ from sky import authentication
11
+ from sky import sky_logging
12
+ from sky.utils import status_lib
13
+
14
# TODO: update to prod endpoint
BASE_URL = 'https://api.hyperbolic.xyz'
# Location of the user's API key file; '~' is expanded by the client.
API_KEY_PATH = '~/.hyperbolic/api_key'

# Retry/timeout tuning knobs.
# NOTE(review): units for TIMEOUT are presumably seconds — confirm.
MAX_RETRIES = 3
RETRY_DELAY = 2  # seconds
TIMEOUT = 120

logger = sky_logging.init_logger(__name__)
23
+
24
+
25
class HyperbolicError(Exception):
    """Root of the exception hierarchy for Hyperbolic API failures."""
28
+
29
+
30
class HyperbolicInstanceStatus(enum.Enum):
    """Instance states reported by the Hyperbolic API.

    Each member's value is the lowercase status string used to look the
    member up in `from_raw_status`.
    """
    UNKNOWN = 'unknown'
    ONLINE = 'online'
    OFFLINE = 'offline'
    STARTING = 'starting'
    STOPPING = 'stopping'
    BUSY = 'busy'
    RESTARTING = 'restarting'
    CREATING = 'creating'
    FAILED = 'failed'
    ERROR = 'error'
    TERMINATED = 'terminated'

    @classmethod
    def cluster_status_map(
        cls
    ) -> Dict['HyperbolicInstanceStatus', Optional[status_lib.ClusterStatus]]:
        """Return the mapping from instance status to cluster status.

        Only ONLINE maps to UP and TERMINATED maps to None; every other
        status (including FAILED and ERROR) maps to INIT.
        """
        return {
            cls.CREATING: status_lib.ClusterStatus.INIT,
            cls.STARTING: status_lib.ClusterStatus.INIT,
            cls.ONLINE: status_lib.ClusterStatus.UP,
            cls.FAILED: status_lib.ClusterStatus.INIT,
            cls.ERROR: status_lib.ClusterStatus.INIT,
            cls.RESTARTING: status_lib.ClusterStatus.INIT,
            cls.STOPPING: status_lib.ClusterStatus.INIT,
            cls.UNKNOWN: status_lib.ClusterStatus.INIT,
            cls.BUSY: status_lib.ClusterStatus.INIT,
            cls.OFFLINE: status_lib.ClusterStatus.INIT,
            cls.TERMINATED: None,
        }

    @classmethod
    def from_raw_status(cls, status: str) -> 'HyperbolicInstanceStatus':
        """Convert raw status string to HyperbolicInstanceStatus enum.

        The lookup is case-insensitive.

        Raises:
            HyperbolicError: If `status` is not a string (for example a
                missing field that came back as None) or does not name a
                known status.
        """
        # Guard first: calling .lower() on a non-string would surface as an
        # AttributeError instead of the error type this method raises.
        if not isinstance(status, str):
            raise HyperbolicError(f'Unknown instance status: {status}')
        try:
            return cls(status.lower())
        except ValueError as exc:
            raise HyperbolicError(f'Unknown instance status: {status}') from exc

    def to_cluster_status(self) -> Optional[status_lib.ClusterStatus]:
        """Convert to SkyPilot cluster status (None for TERMINATED)."""
        return self.cluster_status_map().get(self)
73
+
74
+
75
+ class HyperbolicClient:
76
+ """Client for interacting with the Hyperbolic API."""
77
+
78
def __init__(self):
    """Read the API key from disk and set up request defaults.

    Raises:
        RuntimeError: If no file exists at the expanded API_KEY_PATH.
    """
    key_file = os.path.expanduser(API_KEY_PATH)
    if os.path.exists(key_file):
        with open(key_file, 'r', encoding='utf-8') as handle:
            raw_key = handle.read()
    else:
        raise RuntimeError(f'API key not found at {key_file}')

    # Surrounding whitespace (e.g. a trailing newline) is not part of the key.
    self.api_key = raw_key.strip()
    self.api_url = BASE_URL
    self.headers = {'Authorization': f'Bearer {self.api_key}'}
87
+
88
def _make_request(
        self,
        method: str,
        endpoint: str,
        payload: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Make an API request to Hyperbolic and return the decoded JSON body.

    Args:
        method: HTTP verb; only 'GET' and 'POST' are supported.
        endpoint: Path appended to BASE_URL.
        payload: JSON-serializable body; sent only for POST requests.

    Returns:
        The parsed JSON response, or an empty dict when the response is
        OK but its body is not valid JSON.

    Raises:
        HyperbolicError: If the method is unsupported, the request fails at
            the transport level, the response status is not OK, or any
            other unexpected error occurs.
    """
    url = f'{BASE_URL}{endpoint}'
    headers = {
        'Authorization': f'Bearer {self.api_key}',
        'Content-Type': 'application/json'
    }

    # Debug logging for request
    logger.debug(f'Making {method} request to {url}')
    if payload:
        logger.debug(f'Request payload: {json.dumps(payload, indent=2)}')

    try:
        if method == 'GET':
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
        elif method == 'POST':
            response = requests.post(url,
                                     headers=headers,
                                     json=payload,
                                     timeout=TIMEOUT)
        else:
            raise HyperbolicError(f'Unsupported HTTP method: {method}')

        # Debug logging for response
        logger.debug(f'Response status code: {response.status_code}')
        logger.debug(f'Response headers: {dict(response.headers)}')

        # Try to parse response as JSON. ValueError is the common base of
        # every JSON decode error requests can raise (stdlib json or
        # simplejson), so a non-JSON body is always handled here rather
        # than falling through to the transport-error handler below.
        try:
            response_data = response.json()
            logger.debug(
                f'Response body: {json.dumps(response_data, indent=2)}')
        except ValueError as exc:
            # If response is not JSON, use the raw text
            response_text = response.text
            logger.debug(f'Response body (raw): {response_text}')
            if not response.ok:
                raise HyperbolicError(f'API request failed with status '
                                      f'{response.status_code}: '
                                      f'{response_text}') from exc
            # If response is OK but not JSON, return empty dict
            return {}

        if not response.ok:
            # Error bodies are not guaranteed to be JSON objects; only
            # look up the message keys when the body is a dict.
            if isinstance(response_data, dict):
                error_msg = response_data.get(
                    'error', response_data.get('message', response.text))
            else:
                error_msg = response.text
            raise HyperbolicError(
                f'API request failed with status {response.status_code}: '
                f'{error_msg}')

        return response_data
    except HyperbolicError:
        # Already a well-formed API error; do not re-wrap it as
        # "unexpected".
        raise
    except requests.exceptions.RequestException as e:
        raise HyperbolicError(f'Request failed: {str(e)}') from e
    except Exception as e:
        raise HyperbolicError(
            f'Unexpected error during API request: {str(e)}') from e
149
+
150
def launch_instance(self, gpu_model: str, gpu_count: int,
                    name: str) -> Tuple[str, str]:
    """Create an instance, wait until it is ONLINE, and return how to reach it.

    Args:
        gpu_model: GPU model name sent to the API as 'gpuModel'.
        gpu_count: Number of GPUs; sent to the API as a string.
        name: Cluster name stored in the instance's skypilot user metadata.

    Returns:
        A tuple of (instance ID, SSH command).

    Raises:
        HyperbolicError: If any step of the launch fails.
    """
    # Basic instance info, then let the authentication module amend it.
    request_body = authentication.setup_hyperbolic_authentication({
        'gpuModel': gpu_model,
        'gpuCount': str(gpu_count),
        'userMetadata': {
            'skypilot': {
                'cluster_name': name,
                'launch_time': str(int(time.time()))
            }
        }
    })

    try:
        launch_response = self._make_request(
            'POST',
            '/v2/marketplace/instances/create-cheapest',
            payload=request_body)
        logger.debug(
            f'Launch response: {json.dumps(launch_response, indent=2)}')

        instance_id = launch_response.get('instanceName')
        if not instance_id:
            logger.error(f'No instance ID in response: {launch_response}')
            raise HyperbolicError('No instance ID returned from API')

        logger.info(f'Successfully launched instance {instance_id}, '
                    f'waiting for it to be ready...')

        became_online = self.wait_for_instance(
            instance_id, HyperbolicInstanceStatus.ONLINE.value)
        if not became_online:
            raise HyperbolicError(
                f'Instance {instance_id} failed to reach ONLINE state')

        # Re-list the cluster's instances to pick up the SSH command.
        cluster_filter = {'skypilot': {'cluster_name': name}}
        launched = self.list_instances(
            metadata=cluster_filter).get(instance_id)
        if not launched:
            raise HyperbolicError(
                f'Instance {instance_id} not found after launch')

        ssh_command = launched.get('sshCommand')
        if not ssh_command:
            logger.error(
                f'No SSH command available for instance {instance_id}')
            raise HyperbolicError('No SSH command available for instance')

        logger.info(f'Instance {instance_id} is ready with SSH command')
        return instance_id, ssh_command

    except Exception as e:
        logger.error(f'Failed to launch instance: {str(e)}')
        raise HyperbolicError(f'Failed to launch instance: {str(e)}') from e
208
+
209
def list_instances(
    self,
    status: Optional[str] = None,
    metadata: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, Dict[str, Any]]:
    """List all instances, optionally filtered by status and metadata.

    Args:
        status: If given, only instances whose parsed status equals this
            value (compared after lower-casing it) are returned.
        metadata: If given, only instances whose
            userMetadata['skypilot']['cluster_name'] starts with
            metadata['skypilot']['cluster_name'] are returned.

    Returns:
        A dict mapping instance ID to a summary dict for that instance.
        Instances whose status cannot be parsed are skipped.

    Raises:
        HyperbolicError: If the request or the response handling fails.
    """
    endpoint = '/v1/marketplace/instances'
    try:
        response = self._make_request('GET', endpoint)
        logger.debug(f'Raw API response: {json.dumps(response, indent=2)}')

        # Both filters are loop-invariant, so resolve them once up front.
        wanted_status = status.lower() if status else None
        skypilot_metadata: Dict[str, str] = {}
        if metadata:
            skypilot_metadata = metadata.get('skypilot') or {}
        cluster_name = skypilot_metadata.get('cluster_name') or ''

        instances = {}
        for instance in response.get('instances') or []:
            instance_id = instance.get('id')
            # `or {}` (rather than a .get default) also covers keys that
            # are present but null, so one such instance cannot fail the
            # whole listing.
            instance_info = instance.get('instance') or {}
            current_status = instance_info.get('status')
            logger.debug(f'Instance {instance_id} status: {current_status}')

            # Convert raw status to enum
            try:
                instance_status = HyperbolicInstanceStatus.from_raw_status(
                    current_status)
            except HyperbolicError as e:
                logger.warning(f'Failed to parse status for instance '
                               f'{instance_id}: {e}')
                continue

            if (wanted_status is not None and
                    instance_status.value != wanted_status):
                continue

            user_metadata = instance.get('userMetadata') or {}
            if metadata:
                instance_skypilot = user_metadata.get('skypilot') or {}
                instance_cluster = instance_skypilot.get('cluster_name') or ''
                if not instance_cluster.startswith(cluster_name):
                    logger.debug(f'Skipping instance {instance_id} - '
                                 f'skypilot metadata {instance_skypilot} '
                                 f'does not match {skypilot_metadata}')
                    continue
                logger.debug(f'Including instance {instance_id} '
                             f'- skypilot metadata matches')

            hardware = instance_info.get('hardware') or {}
            instances[instance_id] = {
                'id': instance_id,
                'created': instance.get('created'),
                'sshCommand': instance.get('sshCommand'),
                'status': instance_status.value,
                'gpu_count': instance_info.get('gpu_count'),
                'gpus_total': instance_info.get('gpus_total'),
                'owner': instance_info.get('owner'),
                'cpus': hardware.get('cpus'),
                'gpus': hardware.get('gpus'),
                'ram': hardware.get('ram'),
                'storage': hardware.get('storage'),
                'pricing': instance_info.get('pricing'),
                'metadata': user_metadata
            }
        return instances
    except Exception as e:
        raise HyperbolicError(f'Failed to list instances: {str(e)}') from e
273
+
274
def terminate_instance(self, instance_id: str) -> None:
    """Send a terminate request for the instance with the given ID.

    Raises:
        HyperbolicError: If the terminate request fails for any reason.
    """
    try:
        self._make_request('POST',
                           '/v1/marketplace/instances/terminate',
                           payload={'id': instance_id})
    except Exception as e:
        raise HyperbolicError(
            f'Failed to terminate instance {instance_id}: {str(e)}') from e
283
+
284
def wait_for_instance(self,
                      instance_id: str,
                      target_status: str,
                      timeout: int = TIMEOUT) -> bool:
    """Wait for an instance to reach a specific status.

    The instance list is polled every 5 seconds. For a non-terminal target
    status the instance must also expose an SSH command before the wait
    succeeds.

    Args:
        instance_id: ID of the instance to watch.
        target_status: Raw status string, parsed with
            HyperbolicInstanceStatus.from_raw_status.
        timeout: Maximum number of seconds to wait.

    Returns:
        True if the target was reached. False on timeout, or if the
        instance reached a terminal status other than the target.
    """
    terminal_statuses = ('failed', 'error', 'terminated')
    poll_interval = 5

    start_time = time.time()
    target_status_enum = HyperbolicInstanceStatus.from_raw_status(
        target_status)
    target_value = target_status_enum.value
    # When the caller waits for a terminal status, requiring an SSH command
    # would make success unreachable and the terminal check below would
    # report failure instead. Only require it for non-terminal targets.
    require_ssh = target_value not in terminal_statuses
    ssh_wait_note = ' and have SSH command' if require_ssh else ''
    ssh_done_note = ' and has SSH command' if require_ssh else ''
    logger.info(f'Waiting for instance {instance_id} '
                f'to reach status {target_value}{ssh_wait_note}')

    while True:
        elapsed = time.time() - start_time
        if elapsed >= timeout:
            logger.error(f'Timeout after {int(elapsed)}s '
                         f'waiting for instance {instance_id}')
            return False

        try:
            instance = self.list_instances().get(instance_id)
            if not instance:
                logger.warning(f'Instance {instance_id} not found')
            else:
                current_status = instance.get('status', '').lower()
                ssh_command = instance.get('sshCommand')
                logger.debug(f'Current status: {current_status}, '
                             f'Target status: {target_value}, '
                             f'SSH command: {ssh_command}')

                if current_status == target_value and (ssh_command or
                                                       not require_ssh):
                    logger.info(f'Instance {instance_id} reached '
                                f'target status {target_value}'
                                f'{ssh_done_note} after {int(elapsed)}s')
                    return True

                if current_status in terminal_statuses:
                    logger.error(f'Instance {instance_id} reached '
                                 f'terminal status: {current_status} '
                                 f'after {int(elapsed)}s')
                    return False
        except Exception as e:  # pylint: disable=broad-except
            # Best-effort polling: a failed poll is logged and retried.
            logger.warning(
                f'Error while waiting for instance {instance_id}: {str(e)}')

        time.sleep(poll_interval)
335
+
336
+
337
+ # Module-level singleton client
338
+ _client = None
339
+
340
+
341
def get_client() -> HyperbolicClient:
    """Return the module-level HyperbolicClient, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = HyperbolicClient()
    return _client
347
+
348
+
349
+ # Backward-compatible wrapper functions
350
def launch_instance(gpu_model: str, gpu_count: int,
                    name: str) -> Tuple[str, str]:
    """Launch an instance through the shared client.

    Module-level wrapper that forwards to the client's launch_instance and
    returns its (instance ID, SSH command) tuple.
    """
    client = get_client()
    return client.launch_instance(gpu_model=gpu_model,
                                  gpu_count=gpu_count,
                                  name=name)
354
+
355
+
356
def list_instances(
    status: Optional[str] = None,
    metadata: Optional[Dict[str, Dict[str, str]]] = None
) -> Dict[str, Dict[str, Any]]:
    """List instances through the shared client.

    Module-level wrapper that forwards both optional filters to the
    client's list_instances and returns its result unchanged.
    """
    client = get_client()
    return client.list_instances(status, metadata)
362
+
363
+
364
def terminate_instance(instance_id: str) -> None:
    """Terminate an instance by ID through the shared client."""
    client = get_client()
    return client.terminate_instance(instance_id=instance_id)
367
+
368
+
369
def wait_for_instance(instance_id: str,
                      target_status: str,
                      timeout: int = TIMEOUT) -> bool:
    """Wait, through the shared client, for an instance to reach a status.

    Module-level wrapper that forwards to the client's wait_for_instance
    and returns its boolean result.
    """
    client = get_client()
    return client.wait_for_instance(instance_id=instance_id,
                                    target_status=target_status,
                                    timeout=timeout)
@@ -8,7 +8,9 @@ import time
8
8
  from typing import Any, Callable, Dict, List, Optional, Tuple
9
9
 
10
10
  from sky import exceptions
11
+ from sky import logs
11
12
  from sky import provision
13
+ from sky import resources as resources_lib
12
14
  from sky import sky_logging
13
15
  from sky.provision import common
14
16
  from sky.provision import docker_utils
@@ -21,6 +23,7 @@ from sky.utils import accelerator_registry
21
23
  from sky.utils import command_runner
22
24
  from sky.utils import common_utils
23
25
  from sky.utils import env_options
26
+ from sky.utils import resources_utils
24
27
  from sky.utils import subprocess_utils
25
28
  from sky.utils import timeline
26
29
  from sky.utils import ux_utils
@@ -82,7 +85,7 @@ def _set_usage_run_id_cmd() -> str:
82
85
  latest one when the function is called.
83
86
  """
84
87
  return (
85
- f'cat {usage_constants.USAGE_RUN_ID_FILE} || '
88
+ f'cat {usage_constants.USAGE_RUN_ID_FILE} 2> /dev/null || '
86
89
  # The run id is retrieved locally for the current run, so that the
87
90
  # remote cluster will be set with the same run id as the initial
88
91
  # launch operation.
@@ -90,12 +93,6 @@ def _set_usage_run_id_cmd() -> str:
90
93
  f'{usage_constants.USAGE_RUN_ID_FILE}')
91
94
 
92
95
 
93
- def _set_skypilot_env_var_cmd() -> str:
94
- """Sets the skypilot environment variables on the remote machine."""
95
- env_vars = env_options.Options.all_options()
96
- return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
97
-
98
-
99
96
  def _auto_retry(should_retry: Callable[[Exception], bool] = lambda _: True):
100
97
  """Decorator that retries the function if it fails.
101
98
 
@@ -134,6 +131,20 @@ def _hint_worker_log_path(cluster_name: str, cluster_info: common.ClusterInfo,
134
131
  logger.info(f'Logs of worker nodes can be found at: {worker_log_path}')
135
132
 
136
133
 
134
class SSHThreadPoolExecutor(futures.ThreadPoolExecutor):
    """Thread pool that kills child processes when its `with` block exits."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Tasks in this pool go through the ssh command runner, which ends
        # up in log_lib.run_with_log and spawns subprocesses. Kill those
        # children before shutting the pool down so that leaving the
        # context does not leak them.
        subprocess_utils.kill_children_processes()
        self.shutdown(wait=True)
        # Never suppress an exception raised inside the `with` block.
        return False
146
+
147
+
137
148
  def _parallel_ssh_with_cache(func,
138
149
  cluster_name: str,
139
150
  stage_name: str,
@@ -146,7 +157,7 @@ def _parallel_ssh_with_cache(func,
146
157
  # as 32 is too large for some machines.
147
158
  max_workers = subprocess_utils.get_parallel_threads(
148
159
  cluster_info.provider_name)
149
- with futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
160
+ with SSHThreadPoolExecutor(max_workers=max_workers) as pool:
150
161
  results = []
151
162
  runners = provision.get_command_runners(cluster_info.provider_name,
152
163
  cluster_info, **ssh_credentials)
@@ -423,8 +434,16 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
423
434
  # use the external IP of the head node.
424
435
  use_external_ip = cluster_info.custom_ray_options.pop(
425
436
  'use_external_ip', False)
426
- head_ip = (head_instance.internal_ip
427
- if not use_external_ip else head_instance.external_ip)
437
+
438
+ if use_external_ip:
439
+ head_ip = head_instance.external_ip
440
+ else:
441
+ # For Kubernetes, use the internal service address of the head node.
442
+ # Keep this consistent with the logic in kubernetes-ray.yml.j2
443
+ if head_instance.internal_svc:
444
+ head_ip = head_instance.internal_svc
445
+ else:
446
+ head_ip = head_instance.internal_ip
428
447
 
429
448
  ray_cmd = ray_worker_start_command(custom_resource,
430
449
  cluster_info.custom_ray_options,
@@ -466,11 +485,38 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
466
485
  @common.log_function_start_end
467
486
  @_auto_retry()
468
487
  @timeline.event
469
- def start_skylet_on_head_node(cluster_name: str,
470
- cluster_info: common.ClusterInfo,
471
- ssh_credentials: Dict[str, Any]) -> None:
488
+ def start_skylet_on_head_node(
489
+ cluster_name: resources_utils.ClusterName,
490
+ cluster_info: common.ClusterInfo, ssh_credentials: Dict[str, Any],
491
+ launched_resources: resources_lib.Resources) -> None:
472
492
  """Start skylet on the head node."""
473
- del cluster_name
493
+ # Avoid circular import.
494
+ # pylint: disable=import-outside-toplevel
495
+ from sky.utils import controller_utils
496
+
497
+ def _set_skypilot_env_var_cmd() -> str:
498
+ """Sets the skypilot environment variables on the remote machine."""
499
+ env_vars = {
500
+ k: str(v) for (k, v) in env_options.Options.all_options().items()
501
+ }
502
+ is_controller = controller_utils.Controllers.from_name(
503
+ cluster_name.display_name) is not None
504
+ is_kubernetes = cluster_info.provider_name == 'kubernetes'
505
+ if is_controller and is_kubernetes:
506
+ # For jobs/serve controller, we pass in the CPU and memory limits
507
+ # when starting the skylet to handle cases where these env vars
508
+ # are not set on the cluster's pod spec. The skylet will read
509
+ # these env vars when starting (ManagedJobEvent.start()) and write
510
+ # it to disk.
511
+ resources = launched_resources.assert_launchable()
512
+ vcpus, mem = resources.cloud.get_vcpus_mem_from_instance_type(
513
+ resources.instance_type)
514
+ if vcpus is not None:
515
+ env_vars['SKYPILOT_POD_CPU_CORE_LIMIT'] = str(vcpus)
516
+ if mem is not None:
517
+ env_vars['SKYPILOT_POD_MEMORY_GB_LIMIT'] = str(mem)
518
+ return '; '.join([f'export {k}={v}' for k, v in env_vars.items()])
519
+
474
520
  runners = provision.get_command_runners(cluster_info.provider_name,
475
521
  cluster_info, **ssh_credentials)
476
522
  head_runner = runners[0]
@@ -557,3 +603,36 @@ def internal_file_mounts(cluster_name: str, common_file_mounts: Dict[str, str],
557
603
  ssh_credentials=ssh_credentials,
558
604
  max_workers=subprocess_utils.get_max_workers_for_file_mounts(
559
605
  common_file_mounts, cluster_info.provider_name))
606
+
607
+
608
@common.log_function_start_end
@timeline.event
def setup_logging_on_cluster(logging_agent: logs.LoggingAgent,
                             cluster_name: resources_utils.ClusterName,
                             cluster_info: common.ClusterInfo,
                             ssh_credentials: Dict[str, Any]) -> None:
    """Setup logging agent (fluentbit) on all nodes after provisioning.

    Args:
        logging_agent: Agent that provides the setup command to run.
        cluster_name: Cluster name; its name_on_cloud identifies the
            cluster for the log hint and the parallel SSH stage.
        cluster_info: Provisioned cluster information.
        ssh_credentials: Credentials forwarded to the parallel SSH helper.

    Raises:
        RuntimeError: From the per-node step, if the setup command exits
            with a non-zero code.
    """
    _hint_worker_log_path(cluster_name.name_on_cloud, cluster_info,
                          'logging_setup')

    @_auto_retry()
    def _setup_node(runner: command_runner.CommandRunner, log_path: str):
        cmd = logging_agent.get_setup_command(cluster_name)
        logger.info(f'Running command on node: {cmd}')
        returncode, stdout, stderr = runner.run(cmd,
                                                stream_logs=False,
                                                require_outputs=True,
                                                log_path=log_path,
                                                source_bashrc=True)
        if returncode:
            # Separate the stderr header from its content the same way the
            # stdout section does, so the two do not run together.
            raise RuntimeError(f'Failed to setup logging agent\n{cmd}\n'
                               f'(exit code {returncode}). Error: '
                               f'===== stdout ===== \n{stdout}\n'
                               f'===== stderr ===== \n{stderr}')

    _parallel_ssh_with_cache(_setup_node,
                             cluster_name.name_on_cloud,
                             stage_name='logging_setup',
                             digest=None,
                             cluster_info=cluster_info,
                             ssh_credentials=ssh_credentials)
@@ -11,3 +11,8 @@ from sky.provision.kubernetes.instance import wait_instances
11
11
  from sky.provision.kubernetes.network import cleanup_ports
12
12
  from sky.provision.kubernetes.network import open_ports
13
13
  from sky.provision.kubernetes.network import query_ports
14
+ from sky.provision.kubernetes.volume import apply_volume
15
+ from sky.provision.kubernetes.volume import delete_volume
16
+ from sky.provision.kubernetes.volume import get_all_volumes_usedby
17
+ from sky.provision.kubernetes.volume import get_volume_usedby
18
+ from sky.provision.kubernetes.volume import map_all_volumes_usedby
@@ -3,20 +3,12 @@ import copy
3
3
  import logging
4
4
  import math
5
5
  import os
6
- import typing
7
- from typing import Any, Dict, Optional, Union
6
+ from typing import Any, Dict, List, Optional, Union
8
7
 
9
- from sky.adaptors import common as adaptors_common
10
8
  from sky.adaptors import kubernetes
11
9
  from sky.provision import common
12
- from sky.provision.kubernetes import network_utils
13
10
  from sky.provision.kubernetes import utils as kubernetes_utils
14
- from sky.utils import kubernetes_enums
15
-
16
- if typing.TYPE_CHECKING:
17
- import yaml
18
- else:
19
- yaml = adaptors_common.LazyImport('yaml')
11
+ from sky.utils import yaml_utils
20
12
 
21
13
  logger = logging.getLogger(__name__)
22
14
 
@@ -34,11 +26,6 @@ def bootstrap_instances(
34
26
 
35
27
  _configure_services(namespace, context, config.provider_config)
36
28
 
37
- networking_mode = network_utils.get_networking_mode(
38
- config.provider_config.get('networking_mode'))
39
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
40
- config = _configure_ssh_jump(namespace, context, config)
41
-
42
29
  requested_service_account = config.node_config['spec']['serviceAccountName']
43
30
  if (requested_service_account ==
44
31
  kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
@@ -487,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
487
474
  f'{created_msg(binding_field, name)}')
488
475
 
489
476
 
490
- def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
491
- """Creates a SSH jump pod to connect to the cluster.
492
-
493
- Also updates config['auth']['ssh_proxy_command'] to use the newly created
494
- jump pod.
495
- """
496
- provider_config = config.provider_config
497
- pod_cfg = config.node_config
498
-
499
- ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
500
- ssh_jump_image = provider_config['ssh_jump_image']
501
-
502
- volumes = pod_cfg['spec']['volumes']
503
- # find 'secret-volume' and get the secret name
504
- secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
505
- volumes))
506
- ssh_key_secret_name = secret_volume['secret']['secretName']
507
-
508
- # TODO(romilb): We currently split SSH jump pod and svc creation. Service
509
- # is first created in authentication.py::setup_kubernetes_authentication
510
- # and then SSH jump pod creation happens here. This is because we need to
511
- # set the ssh_proxy_command in the ray YAML before we pass it to the
512
- # autoscaler. If in the future if we can write the ssh_proxy_command to the
513
- # cluster yaml through this method, then we should move the service
514
- # creation here.
515
-
516
- # TODO(romilb): We should add a check here to make sure the service is up
517
- # and available before we create the SSH jump pod. If for any reason the
518
- # service is missing, we should raise an error.
519
-
520
- kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
521
- ssh_key_secret_name, namespace, context)
522
- return config
523
-
524
-
525
477
  def _configure_skypilot_system_namespace(
526
478
  provider_config: Dict[str, Any]) -> None:
527
479
  """Creates the namespace for skypilot-system mounting if it does not exist.
@@ -592,7 +544,7 @@ def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
592
544
  daemonset_path = os.path.join(
593
545
  root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
594
546
  with open(daemonset_path, 'r', encoding='utf-8') as file:
595
- daemonset = yaml.safe_load(file)
547
+ daemonset = yaml_utils.safe_load(file)
596
548
  kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
597
549
  try:
598
550
  kubernetes.apps_api(context).create_namespaced_daemon_set(
@@ -672,4 +624,9 @@ def _configure_services(namespace: str, context: Optional[str],
672
624
 
673
625
 
674
626
class KubernetesError(Exception):
    """Kubernetes error that can carry a list of insufficient resources."""

    def __init__(self,
                 *args,
                 insufficent_resources: Optional[List[str]] = None):
        super().__init__(*args)
        # NOTE: the "insufficent" spelling is part of the public keyword
        # and attribute name, so it is kept as-is for callers.
        self.insufficent_resources = insufficent_resources
@@ -6,3 +6,20 @@ NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
6
6
  '(e.g., skypilot.co/accelerator) are setup correctly. ')
7
7
 
8
8
  KUBERNETES_IN_CLUSTER_NAMESPACE_ENV_VAR = 'SKYPILOT_IN_CLUSTER_NAMESPACE'
9
+
10
# Name of the Kubernetes exec auth wrapper script.
SKY_K8S_EXEC_AUTH_WRAPPER = 'sky-kube-exec-wrapper'

# PATH environment variable for kubectl exec auth execve.
SKY_K8S_EXEC_AUTH_PATH = '$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk/bin:$PATH'  # pylint: disable=line-too-long

# Cache directory for kubeconfigs with modified exec auth.
SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'

# Labels for the Pods created by SkyPilot.
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
TAG_POD_INITIALIZED = 'skypilot-initialized'
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'

# Pod phases that are not holding PVCs (PersistentVolumeClaims).
PVC_NOT_HOLD_POD_PHASES = ['Succeeded', 'Failed']