skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/log_utils.py CHANGED
@@ -47,13 +47,16 @@ class RayUpLineProcessor(LineProcessor):
47
47
  RUNTIME_SETUP = 1
48
48
  PULLING_DOCKER_IMAGES = 2
49
49
 
50
- def __init__(self, log_path: str):
50
+ def __init__(self, log_path: str, cluster_name: Optional[str] = None):
51
51
  self.log_path = log_path
52
+ self.cluster_name = cluster_name
52
53
 
53
54
  def __enter__(self) -> None:
54
55
  self.state = self.ProvisionStatus.LAUNCH
55
56
  self.status_display = rich_utils.safe_status(
56
- ux_utils.spinner_message('Launching', self.log_path))
57
+ ux_utils.spinner_message('Launching',
58
+ self.log_path,
59
+ cluster_name=self.cluster_name))
57
60
  self.status_display.start()
58
61
 
59
62
  def process_line(self, log_line: str) -> None:
@@ -62,19 +65,25 @@ class RayUpLineProcessor(LineProcessor):
62
65
  logger.info(' Head VM is up.')
63
66
  self.status_display.update(
64
67
  ux_utils.spinner_message(
65
- 'Launching - Preparing SkyPilot runtime', self.log_path))
68
+ 'Launching - Preparing SkyPilot runtime',
69
+ self.log_path,
70
+ cluster_name=self.cluster_name))
66
71
  self.state = self.ProvisionStatus.RUNTIME_SETUP
67
72
  if ('Pulling from' in log_line and
68
73
  self.state == self.ProvisionStatus.RUNTIME_SETUP):
69
74
  self.status_display.update(
70
75
  ux_utils.spinner_message(
71
- 'Launching - Initializing docker container', self.log_path))
76
+ 'Launching - Initializing docker container',
77
+ self.log_path,
78
+ cluster_name=self.cluster_name))
72
79
  self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
73
80
  if ('Status: Downloaded newer image' in log_line and
74
81
  self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
75
82
  self.status_display.update(
76
83
  ux_utils.spinner_message(
77
- 'Launching - Preparing SkyPilot runtime', self.log_path))
84
+ 'Launching - Preparing SkyPilot runtime',
85
+ self.log_path,
86
+ cluster_name=self.cluster_name))
78
87
  self.state = self.ProvisionStatus.RUNTIME_SETUP
79
88
 
80
89
  def __exit__(self, except_type: Optional[Type[BaseException]],
@@ -189,108 +198,6 @@ class SkyLocalUpLineProcessor(LineProcessor):
189
198
  self.status_display.stop()
190
199
 
191
200
 
192
- class SkyRemoteUpLineProcessor(LineProcessor):
193
- """A processor for deploy_remote_cluster.sh log lines."""
194
-
195
- def __init__(self, log_path: str, is_local: bool):
196
- self.log_path = log_path
197
- self.is_local = is_local
198
-
199
- def __enter__(self) -> None:
200
- # TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
201
- # messages.
202
- status = rich_utils.safe_status(
203
- ux_utils.spinner_message('Creating remote cluster',
204
- log_path=self.log_path,
205
- is_local=self.is_local))
206
- self.status_display = status
207
- self.status_display.start()
208
-
209
- def process_line(self, log_line: str) -> None:
210
- # Pre-flight checks
211
- if 'SSH connection successful' in log_line:
212
- logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
213
- f'{colorama.Style.RESET_ALL}')
214
-
215
- # Kubernetes installation steps
216
- if 'Deploying Kubernetes on head node' in log_line:
217
- self.status_display.update(
218
- ux_utils.spinner_message(
219
- 'Creating remote cluster - '
220
- 'deploying Kubernetes on head node',
221
- log_path=self.log_path,
222
- is_local=self.is_local))
223
- if 'K3s deployed on head node.' in log_line:
224
- logger.info(f'{colorama.Fore.GREEN}'
225
- '✔ K3s successfully deployed on head node.'
226
- f'{colorama.Style.RESET_ALL}')
227
-
228
- # Worker nodes
229
- if 'Deploying Kubernetes on worker node' in log_line:
230
- self.status_display.update(
231
- ux_utils.spinner_message(
232
- 'Creating remote cluster - '
233
- 'deploying Kubernetes on worker nodes',
234
- log_path=self.log_path,
235
- is_local=self.is_local))
236
- if 'Kubernetes deployed on worker node' in log_line:
237
- logger.info(f'{colorama.Fore.GREEN}'
238
- '✔ K3s successfully deployed on worker node.'
239
- f'{colorama.Style.RESET_ALL}')
240
-
241
- # Cluster configuration
242
- if 'Configuring local kubectl to connect to the cluster...' in log_line:
243
- self.status_display.update(
244
- ux_utils.spinner_message(
245
- 'Creating remote cluster - '
246
- 'configuring local kubectl',
247
- log_path=self.log_path,
248
- is_local=self.is_local))
249
- if 'kubectl configured to connect to the cluster.' in log_line:
250
- logger.info(f'{colorama.Fore.GREEN}'
251
- '✔ kubectl configured for the remote cluster.'
252
- f'{colorama.Style.RESET_ALL}')
253
-
254
- # GPU operator installation
255
- if 'Installing Nvidia GPU Operator...' in log_line:
256
- self.status_display.update(
257
- ux_utils.spinner_message(
258
- 'Creating remote cluster - '
259
- 'installing Nvidia GPU Operator',
260
- log_path=self.log_path,
261
- is_local=self.is_local))
262
- if 'GPU Operator installed.' in log_line:
263
- logger.info(f'{colorama.Fore.GREEN}'
264
- '✔ Nvidia GPU Operator installed successfully.'
265
- f'{colorama.Style.RESET_ALL}')
266
-
267
- # Cleanup steps
268
- if 'Cleaning up head node' in log_line:
269
- self.status_display.update(
270
- ux_utils.spinner_message('Cleaning up head node',
271
- log_path=self.log_path,
272
- is_local=self.is_local))
273
- if 'Cleaning up node' in log_line:
274
- self.status_display.update(
275
- ux_utils.spinner_message('Cleaning up worker node',
276
- log_path=self.log_path,
277
- is_local=self.is_local))
278
- if 'cleaned up successfully' in log_line:
279
- logger.info(f'{colorama.Fore.GREEN}'
280
- f'{log_line.strip()}{colorama.Style.RESET_ALL}')
281
-
282
- # Final status
283
- if 'Cluster deployment completed.' in log_line:
284
- logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
285
- f'{colorama.Style.RESET_ALL}')
286
-
287
- def __exit__(self, except_type: Optional[Type[BaseException]],
288
- except_value: Optional[BaseException],
289
- traceback: Optional[types.TracebackType]) -> None:
290
- del except_type, except_value, traceback # unused
291
- self.status_display.stop()
292
-
293
-
294
201
  def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
295
202
  """Creates table with default style."""
296
203
  border = kwargs.pop('border', False)
@@ -356,6 +263,74 @@ def readable_time_duration(start: Optional[float],
356
263
  return diff
357
264
 
358
265
 
266
def human_duration(start: int, end: Optional[int] = None) -> str:
    """Formats the time elapsed between two Unix timestamps compactly.

    The result keeps at most two adjacent units and gets coarser as the
    duration grows, similar to Kubernetes' duration format:

        under 2 minutes  -> seconds only,        e.g. "95s"
        under 10 minutes -> minutes and seconds, e.g. "2m5s"
        under 3 hours    -> minutes only,        e.g. "150m"
        under 8 hours    -> hours and minutes,   e.g. "3h5m"
        under 48 hours   -> hours only,          e.g. "30h"
        under 8 days     -> days and hours,      e.g. "2d3h"
        under 2 years    -> days only,           e.g. "400d"
        under 8 years    -> years and days,      e.g. "2y3d"
        otherwise        -> years only,          e.g. "9y"

    A trailing unit that would be zero is omitted ("2m", not "2m0s"). A
    year is counted as 365 days.

    Args:
        start: The start time as a Unix timestamp (seconds since epoch).
        end: The end time as a Unix timestamp (seconds since epoch).
            If None, current time is used.

    Returns:
        A string representing the duration, e.g., "2d3h", "15m", "30s".
        Returns "0s" for zero or negative durations, or if the start
        timestamp is missing or not positive.
    """
    if not start or start <= 0:
        return '0s'

    if end is None:
        end = int(time.time())

    # Truncate to whole seconds once, so that float timestamps cannot leak
    # fractional values such as "90.5s" into the output, and so that every
    # bucket below can use exact integer arithmetic.
    total_seconds = int(end - start)
    if total_seconds <= 0:
        return '0s'
    if total_seconds < 2 * 60:
        return f'{total_seconds}s'

    total_minutes, seconds = divmod(total_seconds, 60)
    if total_minutes < 10:
        return f'{total_minutes}m{seconds}s' if seconds else f'{total_minutes}m'
    if total_minutes < 3 * 60:
        return f'{total_minutes}m'

    total_hours, minutes = divmod(total_minutes, 60)
    if total_hours < 8:
        return f'{total_hours}h{minutes}m' if minutes else f'{total_hours}h'
    if total_hours < 48:
        return f'{total_hours}h'

    total_days, hours = divmod(total_hours, 24)
    if total_hours < 24 * 8:
        return f'{total_days}d{hours}h' if hours else f'{total_days}d'
    if total_hours < 24 * 365 * 2:
        return f'{total_days}d'

    years, days = divmod(total_days, 365)
    if total_hours < 24 * 365 * 8:
        return f'{years}y{days}d' if days else f'{years}y'
    return f'{years}y'
332
+
333
+
359
334
  def follow_logs(
360
335
  file: TextIO,
361
336
  *,
@@ -0,0 +1,22 @@
1
+ """Utility functions for performance monitoring."""
2
+ import os
3
+ from typing import Optional
4
+
5
+ from sky import sky_logging
6
+ from sky.skylet import constants
7
+
8
+ logger = sky_logging.init_logger(__name__)
9
+
10
+
11
def get_loop_lag_threshold() -> Optional[float]:
    """Reads the loop lag threshold from the environment.

    The raw value of the environment variable named by
    constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS is parsed as a float and
    divided by 1000 (the variable name indicates milliseconds, so the
    result is presumably in seconds).

    Returns:
        The scaled threshold, or None when the variable is unset or its
        value cannot be parsed as a float (a warning is logged in the
        latter case).
    """
    env_var = constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS
    raw_value = os.getenv(env_var, None)
    if raw_value is None:
        return None

    try:
        threshold_ms = float(raw_value)
    except ValueError:
        logger.warning(f'Invalid value for {env_var}:'
                       f' {raw_value}')
        return None
    return threshold_ms / 1000.0
@@ -0,0 +1,298 @@
1
+ """Resource checking utilities for finding active clusters and managed jobs."""
2
+
3
+ import concurrent.futures
4
+ from typing import Any, Callable, Dict, List, Tuple
5
+
6
+ from sky import exceptions
7
+ from sky import global_user_state
8
+ from sky import sky_logging
9
+ from sky.skylet import constants
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
def check_no_active_resources_for_users(
        user_operations: List[Tuple[str, str]]) -> None:
    """Verifies that the given users own no active clusters or managed jobs.

    A cluster or job counts as owned by a user when its 'user_hash' entry
    equals the user id.

    Args:
        user_operations: List of tuples (user_id, operation) where
            operation is 'update' or 'delete'. An empty list is a no-op.

    Raises:
        ValueError: If any user has active clusters or managed jobs.
            The error message will include all users with issues.
    """
    if user_operations:

        def make_owner_filter(owner_id: str):
            """Builds a predicate matching resources owned by owner_id."""

            def is_owned(resource) -> bool:
                return resource.get('user_hash') == owner_id

            return is_owned

        _check_active_resources(user_operations, make_owner_filter, 'user')
33
+
34
+
35
def check_no_active_resources_for_workspaces(
        workspace_operations: List[Tuple[str, str]]) -> None:
    """Verifies that the given workspaces hold no active clusters or jobs.

    A cluster or job belongs to a workspace when its 'workspace' entry
    equals the workspace name; resources without that entry are attributed
    to constants.SKYPILOT_DEFAULT_WORKSPACE.

    Args:
        workspace_operations: List of tuples (workspace_name, operation)
            where operation is 'update' or 'delete'. An empty list is a
            no-op.

    Raises:
        ValueError: If any workspace has active clusters or managed jobs.
            The error message will include all workspaces with issues.
    """
    if workspace_operations:

        def make_workspace_filter(target_workspace: str):
            """Builds a predicate matching resources in target_workspace."""

            def is_in_workspace(resource) -> bool:
                resource_workspace = resource.get(
                    'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
                return resource_workspace == target_workspace

            return is_in_workspace

        _check_active_resources(workspace_operations, make_workspace_filter,
                                'workspace')
57
+
58
+
59
def _check_active_resources(resource_operations: List[Tuple[str, str]],
                            filter_factory: Callable[[str],
                                                     Callable[[Dict[str, Any]],
                                                              bool]],
                            resource_type: str) -> None:
    """Rejects operations on entities that still have active resources.

    Active clusters and managed jobs are fetched once and then matched
    against every requested entity, so that a single error can report all
    offending entities together.

    Args:
        resource_operations: List of tuples (resource_name, operation) where
            operation is 'update' or 'delete'.
        filter_factory: Function that takes a resource_name and returns a
            predicate selecting the clusters/jobs belonging to it.
        resource_type: Type of resource being checked ('user' or
            'workspace'). For 'user', resource_name is a user id and the
            error message shows the user's name when one is recorded.

    Raises:
        ValueError: If any resource has active clusters or managed jobs.
    """
    clusters, managed_jobs = _get_active_resources()

    # One message per blocked entity; raised together at the end.
    blocked = []
    for entity_name, operation in resource_operations:
        belongs_to_entity = filter_factory(entity_name)
        owned_clusters = [c for c in clusters if belongs_to_entity(c)]
        owned_jobs = [j for j in managed_jobs if belongs_to_entity(j)]

        findings = []
        if owned_clusters:
            names = ', '.join(c['name'] for c in owned_clusters)
            findings.append(
                f'{len(owned_clusters)} active cluster(s): {names}')
        if owned_jobs:
            job_ids = ', '.join(str(j['job_id']) for j in owned_jobs)
            findings.append(f'{len(owned_jobs)} active managed job(s): '
                            f'{job_ids}')
        if not findings:
            continue

        display_name = entity_name
        if resource_type == 'user':
            # entity_name is a user id; prefer the recorded user name.
            user_record = global_user_state.get_user(entity_name)
            if user_record and user_record.name:
                display_name = user_record.name
        summary = ' and '.join(findings)
        blocked.append(f'Cannot {operation} {resource_type} {display_name!r} '
                       f'because it has {summary}.')

    if not blocked:
        return

    if len(blocked) == 1:
        message = blocked[0] + ' Please terminate these resources first.'
    else:
        bullet_list = '\n'.join(f'• {msg}' for msg in blocked)
        message = (f'Cannot proceed due to active resources in '
                   f'{len(blocked)} {resource_type}(s):\n' + bullet_list +
                   '\nPlease terminate these resources first.')
    raise ValueError(message)
139
+
140
+
141
def check_users_workspaces_active_resources(
        user_ids: List[str],
        workspace_names: List[str]) -> Tuple[str, List[str], Dict[str, str]]:
    """Finds active resources in workspaces owned by users outside user_ids.

    Looks at the active clusters and managed jobs of the given workspaces
    and reports those whose 'user_hash' is set but not contained in
    user_ids. Resources without a 'user_hash' are ignored.

    Args:
        user_ids: List of user_id.
        workspace_names: List of workspace_name.

    Returns:
        A tuple (resource_error_summary, missed_users_names,
        missed_user_dict):
          - resource_error_summary: Description of the offending clusters
            and/or managed jobs joined by ' and ', or '' if there are none.
          - missed_users_names: Display name (the user name, or the user id
            when no name is set) of every offending owner that is returned
            by global_user_state.get_all_users().
          - missed_user_dict: Mapping from user id to that same display
            name, for the same set of owners.
    """
    all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
        workspace_names)

    # Build the set once so each ownership test is a constant-time lookup
    # instead of a scan over the user_ids list.
    allowed_users = set(user_ids)
    missed_users = set()

    def _collect_owned_by_others(resources, name_key):
        """Returns resource[name_key] for resources of non-listed owners."""
        collected = []
        for resource in resources:
            owner = resource.get('user_hash')
            if owner and owner not in allowed_users:
                missed_users.add(owner)
                collected.append(resource[name_key])
        return collected

    active_cluster_names = _collect_owned_by_others(all_clusters, 'name')
    active_job_names = [
        str(job_id)
        for job_id in _collect_owned_by_others(all_managed_jobs, 'job_id')
    ]

    resource_errors = []
    if active_cluster_names:
        cluster_list = ', '.join(active_cluster_names)
        resource_errors.append(
            f'{len(active_cluster_names)} active cluster(s):'
            f' {cluster_list}')
    if active_job_names:
        job_list = ', '.join(active_job_names)
        resource_errors.append(f'{len(active_job_names)} active'
                               f' managed job(s): {job_list}')
    resource_error_summary = ' and '.join(resource_errors)

    missed_users_names = []
    missed_user_dict = {}
    if missed_users:
        # Only fetch the user table when there is something to resolve.
        for user in global_user_state.get_all_users():
            if user.id in missed_users:
                display_name = user.name if user.name else user.id
                missed_users_names.append(display_name)
                missed_user_dict[user.id] = display_name
    return resource_error_summary, missed_users_names, missed_user_dict
199
+
200
+
201
def _get_active_resources_for_workspaces(
    workspace_names: List[str]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Returns the active clusters and managed jobs of the given workspaces.

    A resource is attributed to its 'workspace' entry, falling back to
    constants.SKYPILOT_DEFAULT_WORKSPACE when that entry is absent.

    Args:
        workspace_names: List of workspace_name. When empty, nothing is
            fetched and two empty lists are returned.

    Returns:
        A tuple (clusters, managed_jobs) restricted to the workspaces.
    """
    if workspace_names:

        def make_membership_filter(names: List[str]):
            """Builds a predicate matching resources in any of names."""

            def is_in_requested_workspaces(resource) -> bool:
                resource_workspace = resource.get(
                    'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
                return resource_workspace in names

            return is_in_requested_workspaces

        return _get_active_resources_by_names(workspace_names,
                                              make_membership_filter)
    return [], []
222
+
223
+
224
def _get_active_resources_by_names(
    resource_names: List[str],
    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Returns the active clusters and managed jobs matching a name list.

    Args:
        resource_names: List of resource_name.
        filter_factory: Function that takes the whole resource_names list
            and returns a predicate applied to each cluster/job.

    Returns:
        A tuple (clusters, managed_jobs) holding only the entries accepted
        by the predicate. Either list is empty when nothing was fetched
        for it.
    """
    fetched_clusters, fetched_jobs = _get_active_resources()

    # A single predicate covers all names; it is built once and reused for
    # both clusters and jobs.
    is_selected = filter_factory(resource_names)

    selected_clusters = ([c for c in fetched_clusters if is_selected(c)]
                         if fetched_clusters else [])
    selected_jobs = ([j for j in fetched_jobs if is_selected(j)]
                     if fetched_jobs else [])
    return selected_clusters, selected_jobs
262
+
263
+
264
def _get_active_resources(
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Fetches all clusters and all unfinished managed jobs concurrently.

    The two lookups run on a two-worker thread pool so that neither waits
    for the other.

    Returns:
        A tuple (all_clusters, all_managed_jobs). The job list is empty
        when the job queue raises exceptions.ClusterNotUpError.
    """

    def fetch_unfinished_managed_jobs() -> List[Dict[str, Any]]:
        # Imported lazily, at call time, as in the rest of this module's
        # handling of the managed jobs package.
        # pylint: disable=import-outside-toplevel
        from sky.jobs.server import core as managed_jobs_core
        try:
            queue_result = managed_jobs_core.queue_v2(
                refresh=False,
                skip_finished=True,
                all_users=True,
                fields=['job_id', 'user_hash', 'workspace'])
        except exceptions.ClusterNotUpError:
            # NOTE(review): presumably raised when the jobs controller is
            # not running, in which case no job can be active - confirm.
            logger.warning('All jobs should be finished.')
            return []
        # Only the first element (the job records) is needed here.
        jobs, _, _, _ = queue_result
        return jobs

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        pending_clusters = pool.submit(global_user_state.get_clusters)
        pending_jobs = pool.submit(fetch_unfinished_managed_jobs)

        clusters = pending_clusters.result()
        managed_jobs = pending_jobs.result()

    return clusters, managed_jobs