skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/utils/log_utils.py CHANGED
@@ -47,13 +47,16 @@ class RayUpLineProcessor(LineProcessor):
47
47
  RUNTIME_SETUP = 1
48
48
  PULLING_DOCKER_IMAGES = 2
49
49
 
50
- def __init__(self, log_path: str):
50
+ def __init__(self, log_path: str, cluster_name: Optional[str] = None):
51
51
  self.log_path = log_path
52
+ self.cluster_name = cluster_name
52
53
 
53
54
  def __enter__(self) -> None:
54
55
  self.state = self.ProvisionStatus.LAUNCH
55
56
  self.status_display = rich_utils.safe_status(
56
- ux_utils.spinner_message('Launching', self.log_path))
57
+ ux_utils.spinner_message('Launching',
58
+ self.log_path,
59
+ cluster_name=self.cluster_name))
57
60
  self.status_display.start()
58
61
 
59
62
  def process_line(self, log_line: str) -> None:
@@ -62,19 +65,25 @@ class RayUpLineProcessor(LineProcessor):
62
65
  logger.info(' Head VM is up.')
63
66
  self.status_display.update(
64
67
  ux_utils.spinner_message(
65
- 'Launching - Preparing SkyPilot runtime', self.log_path))
68
+ 'Launching - Preparing SkyPilot runtime',
69
+ self.log_path,
70
+ cluster_name=self.cluster_name))
66
71
  self.state = self.ProvisionStatus.RUNTIME_SETUP
67
72
  if ('Pulling from' in log_line and
68
73
  self.state == self.ProvisionStatus.RUNTIME_SETUP):
69
74
  self.status_display.update(
70
75
  ux_utils.spinner_message(
71
- 'Launching - Initializing docker container', self.log_path))
76
+ 'Launching - Initializing docker container',
77
+ self.log_path,
78
+ cluster_name=self.cluster_name))
72
79
  self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
73
80
  if ('Status: Downloaded newer image' in log_line and
74
81
  self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
75
82
  self.status_display.update(
76
83
  ux_utils.spinner_message(
77
- 'Launching - Preparing SkyPilot runtime', self.log_path))
84
+ 'Launching - Preparing SkyPilot runtime',
85
+ self.log_path,
86
+ cluster_name=self.cluster_name))
78
87
  self.state = self.ProvisionStatus.RUNTIME_SETUP
79
88
 
80
89
  def __exit__(self, except_type: Optional[Type[BaseException]],
@@ -190,7 +199,7 @@ class SkyLocalUpLineProcessor(LineProcessor):
190
199
 
191
200
 
192
201
  class SkyRemoteUpLineProcessor(LineProcessor):
193
- """A processor for deploy_remote_cluster.sh log lines."""
202
+ """A processor for deploy_remote_cluster.py log lines."""
194
203
 
195
204
  def __init__(self, log_path: str, is_local: bool):
196
205
  self.log_path = log_path
@@ -291,6 +300,223 @@ class SkyRemoteUpLineProcessor(LineProcessor):
291
300
  self.status_display.stop()
292
301
 
293
302
 
303
class SkySSHUpLineProcessor(LineProcessor):
    """A processor for deploy_remote_cluster.py log lines for SSH clusters.

    Used as a context manager: entering starts a status spinner, each call to
    ``process_line`` turns a recognised marker/message from the deploy log
    into a spinner update and/or a user-facing log message, and exiting stops
    the spinner.
    """

    def __init__(self, log_path: str, is_local: bool):
        """Stores the log location and resets the per-run tracking state.

        Args:
            log_path: Passed through to every spinner message as ``log_path``.
            is_local: Passed through to every spinner message as ``is_local``.
        """
        self.log_path = log_path
        self.is_local = is_local
        # Name taken from the most recent SKYPILOT_CURRENT_CLUSTER: marker.
        self.current_cluster: Optional[str] = None
        # Flipped to True once a SKYPILOT_CLEANUP_MODE: marker is seen.
        self.is_cleanup_mode = False

    def __enter__(self) -> None:
        """Creates and starts the status spinner."""
        self.status_display = rich_utils.safe_status(
            self._spinner('Preparing to set up SSH Node Pools'))
        self.status_display.start()

    def _spinner(self, message: str):
        """Builds a spinner message bound to this processor's log settings."""
        return ux_utils.spinner_message(message,
                                        log_path=self.log_path,
                                        is_local=self.is_local)

    def _update_status(self, message: str) -> None:
        """Replaces the text shown by the running spinner."""
        self.status_display.update(self._spinner(message))

    def _cluster_suffix(self) -> str:
        """Returns ' \\[<cluster>]' when a cluster is known, else ''."""
        # The backslash before '[' is kept verbatim in the emitted text;
        # presumably it escapes the bracket for the status renderer's markup
        # -- TODO confirm.
        if self.current_cluster:
            return f' \\[{self.current_cluster}]'
        return ''

    @staticmethod
    def _node_name(log_line: str) -> str:
        """Returns the text inside the last '(...)' of the line."""
        return log_line.split('(')[-1].split(')')[0]

    def process_line(self, log_line: str) -> None:
        """Reacts to a single line of the deployment log.

        The checks below are independent ``if`` statements (not an elif
        chain): one line may trigger several of them, and they run in the
        order written.

        Args:
            log_line: One line read from the deployment log.
        """
        indent = ux_utils.INDENT_SYMBOL
        last_indent = ux_utils.INDENT_LAST_SYMBOL
        reset = colorama.Style.RESET_ALL
        dim = colorama.Style.DIM
        cyan = colorama.Fore.CYAN
        green = colorama.Fore.GREEN
        red = colorama.Fore.RED
        yellow = colorama.Fore.YELLOW

        # Detect cleanup mode
        if 'SKYPILOT_CLEANUP_MODE:' in log_line:
            self.is_cleanup_mode = True
            if self.current_cluster:
                self._update_status(
                    f'Cleaning up Node Pool: \\[{self.current_cluster}]')

        # Cluster detection message
        if 'SKYPILOT_CLUSTER_INFO:' in log_line:
            clusters_part = log_line.split('SKYPILOT_CLUSTER_INFO:',
                                           1)[1].strip()
            if clusters_part.startswith('Found'):
                logger.info(f'{reset}{cyan}{clusters_part}{reset}')

        # Current cluster being operated on
        if 'SKYPILOT_CURRENT_CLUSTER:' in log_line:
            self.current_cluster = log_line.split('SKYPILOT_CURRENT_CLUSTER:',
                                                  1)[1].strip()

            # The messages below intentionally spell the cluster name out
            # instead of using _cluster_suffix(): they print the name even
            # when the parsed value is an empty string.
            if self.is_cleanup_mode:
                self._update_status(
                    f'Cleaning up Node Pool: {self.current_cluster}')
                logger.info(f'{cyan}\nCleaning up Node Pool: '
                            f'{self.current_cluster}{reset}')
            else:
                self._update_status(
                    f'Deploying SkyPilot \\[{self.current_cluster}]')
                logger.info(f'{reset}{cyan}\nSetting up Node Pool: '
                            f'{self.current_cluster}{reset}')

        # Handle cluster completion marker
        if 'SKYPILOT_CLUSTER_COMPLETED:' in log_line:
            if self.is_cleanup_mode:
                logger.info(f'{last_indent}{green}'
                            f'✔ Node Pool {self.current_cluster} cleaned up '
                            f'successfully.{reset}')
            else:
                logger.info(
                    f'{last_indent}{green}'
                    f'✔ Node Pool {self.current_cluster} deployed successfully.'
                    f'{reset}')

        # Pre-flight checks
        if 'Checking SSH connection to head node' in log_line:
            logger.info(f'{indent}{dim}'
                        'Checking SSH connection to head node...'
                        f'{reset}')

        if log_line.startswith('SSH connection successful'):
            node_name = self._node_name(log_line)
            logger.info(f'{indent}{green}'
                        '✔ SSH connection established to head node '
                        f'{node_name}.{reset}')

        # Kubernetes installation steps
        if 'Deploying Kubernetes on head node' in log_line:
            self._update_status('Deploying SkyPilot runtime on head node'
                                f'{self._cluster_suffix()}')

        if 'K3s deployed on head node' in log_line:
            node_name = self._node_name(log_line)
            logger.info(
                f'{indent}{green}'
                f'✔ SkyPilot runtime successfully deployed on head node '
                f'{node_name}.{reset}')

        # Worker nodes
        if 'Deploying Kubernetes on worker node' in log_line:
            self._update_status('Deploying SkyPilot runtime on worker nodes' +
                                self._cluster_suffix())

        if 'Kubernetes deployed on worker node' in log_line:
            node_name = self._node_name(log_line)
            logger.info(
                f'{indent}{green}'
                '✔ SkyPilot runtime successfully deployed on worker node '
                f'{node_name}.{reset}')

        if 'Failed to deploy K3s on worker node' in log_line:
            node_name = self._node_name(log_line)
            logger.info(f'{indent}{red}'
                        '✗ Failed to deploy K3s on worker node '
                        f'{node_name}.{reset}')

        # Cluster configuration
        if 'Configuring local kubectl to connect to the cluster...' in log_line:
            self._update_status('Setting up SkyPilot configuration' +
                                self._cluster_suffix())

        if 'kubectl configured to connect to the cluster.' in log_line:
            logger.info(f'{indent}{green}'
                        '✔ SkyPilot configuration complete.'
                        f'{reset}')

        # GPU operator installation
        if 'Installing Nvidia GPU Operator...' in log_line:
            self._update_status('Configuring Nvidia GPUs' +
                                self._cluster_suffix())

        if 'GPU Operator installed.' in log_line:
            logger.info(f'{indent}{green}'
                        '✔ Nvidia GPUs configured successfully.'
                        f'{reset}')

        # Cleanup steps
        if 'Cleaning up head node' in log_line:
            self._update_status('Cleaning up head node' +
                                self._cluster_suffix())

        if 'Cleaning up worker node' in log_line:
            self._update_status('Cleaning up worker nodes' +
                                self._cluster_suffix())

        # Handle node cleanup success messages
        if 'Node' in log_line and 'cleaned up successfully' in log_line:
            logger.info(f'{indent}{green}{log_line.strip()}{reset}')

        # NOTE(review): a line containing both 'Node' and
        # 'Failed to clean up worker node' matches the next two checks and is
        # logged twice -- confirm whether the deploy script can emit that.
        if 'Node' in log_line and 'Failed to clean up' in log_line:
            logger.info(f'{indent}{red}{log_line.strip()}{reset}')

        if 'Failed to clean up worker node' in log_line:
            logger.info(f'{indent}{red}{log_line.strip()}{reset}')

        # Final status for the cluster deployment
        if 'Cluster deployment completed.' in log_line:
            logger.info(f'{indent}{green}'
                        '✔ SkyPilot runtime is up.'
                        f'{reset}')

        if 'Failed to deploy Kubernetes on the following nodes:' in log_line:
            logger.info(log_line.strip())

        if 'already exists in history. ' in log_line:
            node_name = self._node_name(log_line)
            logger.info(f'{indent}{yellow}'
                        '✔ SkyPilot runtime already deployed on worker node '
                        f'{node_name}. Skipping.{reset}')

        if 'Failed to setup TCP forwarding on head node' in log_line:
            node_name = self._node_name(log_line)
            logger.info(
                f'{indent}{red}'
                f'✗ Failed to setup TCP forwarding on head node {node_name}.'
                f'{reset}')

        if 'Error in deploying SSH Target' in log_line:
            logger.info(f'{last_indent}{red}{log_line.strip()}{reset}')

    def __exit__(self, except_type: Optional[Type[BaseException]],
                 except_value: Optional[BaseException],
                 traceback: Optional[types.TracebackType]) -> None:
        """Stops the spinner; exception details are ignored."""
        del except_type, except_value, traceback  # unused
        self.status_display.stop()
518
+
519
+
294
520
  def create_table(field_names: List[str], **kwargs) -> prettytable.PrettyTable:
295
521
  """Creates table with default style."""
296
522
  border = kwargs.pop('border', False)
@@ -356,6 +582,74 @@ def readable_time_duration(start: Optional[float],
356
582
  return diff
357
583
 
358
584
 
585
def human_duration(start: int, end: Optional[int] = None) -> str:
    """Calculates the time elapsed between two timestamps and returns
    it as a human-readable string, similar to Kubernetes' duration format.

    The precision shrinks as the duration grows: seconds below 2 minutes,
    minutes+seconds below 10 minutes, minutes below 3 hours, hours+minutes
    below 8 hours, hours below 2 days, days+hours below 8 days, days below
    2 years, years+days below 8 years, and whole years beyond that. A year
    is counted as 365 days.

    Args:
        start: The start time as a Unix timestamp (seconds since epoch).
        end: The end time as a Unix timestamp (seconds since epoch).
            If None, current time is used.

    Returns:
        A string representing the duration, e.g., "2d3h", "15m", "30s".
        Returns "0s" for zero, negative durations, or if the timestamp
        is invalid.
    """
    if not start or start <= 0:
        return '0s'

    if end is None:
        end = int(time.time())
    # Truncate to whole seconds so that float timestamps (e.g. a raw
    # time.time() value) never leak fractional digits such as '30.7s'.
    duration_seconds = int(end - start)

    if duration_seconds <= 0:
        return '0s'
    if duration_seconds < 60 * 2:
        return f'{duration_seconds}s'

    minutes, s = divmod(duration_seconds, 60)
    if minutes < 10:
        return f'{minutes}m{s}s' if s else f'{minutes}m'
    if minutes < 60 * 3:
        return f'{minutes}m'

    hours, m = divmod(minutes, 60)
    if hours < 8:
        return f'{hours}h{m}m' if m else f'{hours}h'
    if hours < 48:
        return f'{hours}h'

    days, h = divmod(hours, 24)
    if hours < 24 * 8:
        return f'{days}d{h}h' if h else f'{days}d'
    if hours < 24 * 365 * 2:
        return f'{days}d'

    years, dy = divmod(days, 365)
    if hours < 24 * 365 * 8:
        return f'{years}y{dy}d' if dy else f'{years}y'
    return f'{years}y'
651
+
652
+
359
653
  def follow_logs(
360
654
  file: TextIO,
361
655
  *,
@@ -0,0 +1,22 @@
1
+ """Utility functions for performance monitoring."""
2
+ import os
3
+ from typing import Optional
4
+
5
+ from sky import sky_logging
6
+ from sky.skylet import constants
7
+
8
+ logger = sky_logging.init_logger(__name__)
9
+
10
+
11
def get_loop_lag_threshold() -> Optional[float]:
    """Read the loop lag threshold from the environment.

    The variable named by ``constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS`` is
    parsed as a float and divided by 1000 (the name suggests the raw value is
    in milliseconds, making the result seconds).

    Returns:
        The parsed value divided by 1000, or None when the variable is unset
        or cannot be parsed as a float (a warning is logged in that case).
    """
    raw_value = os.getenv(constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS, None)
    if raw_value is None:
        return None

    try:
        threshold = float(raw_value)
    except ValueError:
        logger.warning(
            f'Invalid value for {constants.ENV_VAR_LOOP_LAG_THRESHOLD_MS}:'
            f' {raw_value}')
        return None
    return threshold / 1000.0
@@ -0,0 +1,298 @@
1
+ """Resource checking utilities for finding active clusters and managed jobs."""
2
+
3
+ import concurrent.futures
4
+ from typing import Any, Callable, Dict, List, Tuple
5
+
6
+ from sky import exceptions
7
+ from sky import global_user_state
8
+ from sky import sky_logging
9
+ from sky.skylet import constants
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
def check_no_active_resources_for_users(
        user_operations: List[Tuple[str, str]]) -> None:
    """Ensure none of the given users owns active clusters or managed jobs.

    A cluster/job record is attributed to a user when its 'user_hash' entry
    equals the user id.

    Args:
        user_operations: List of tuples (user_id, operation) where
            operation is 'update' or 'delete'. Nothing is checked when the
            list is empty.

    Raises:
        ValueError: If any user has active clusters or managed jobs.
            The error message will include all users with issues.
    """

    def make_user_filter(user_id: str):

        def owned_by_user(resource) -> bool:
            return resource.get('user_hash') == user_id

        return owned_by_user

    if user_operations:
        _check_active_resources(user_operations, make_user_filter, 'user')
33
+
34
+
35
def check_no_active_resources_for_workspaces(
        workspace_operations: List[Tuple[str, str]]) -> None:
    """Ensure none of the given workspaces has active clusters or jobs.

    A cluster/job record belongs to a workspace when its 'workspace' entry
    (falling back to ``constants.SKYPILOT_DEFAULT_WORKSPACE`` when the key is
    absent) equals the workspace name.

    Args:
        workspace_operations: List of tuples (workspace_name, operation)
            where operation is 'update' or 'delete'. Nothing is checked when
            the list is empty.

    Raises:
        ValueError: If any workspace has active clusters or managed jobs.
            The error message will include all workspaces with issues.
    """

    def make_workspace_filter(workspace_name: str):

        def in_workspace(resource) -> bool:
            owner = resource.get('workspace',
                                 constants.SKYPILOT_DEFAULT_WORKSPACE)
            return owner == workspace_name

        return in_workspace

    if workspace_operations:
        _check_active_resources(workspace_operations, make_workspace_filter,
                                'workspace')
57
+
58
+
59
def _check_active_resources(resource_operations: List[Tuple[str, str]],
                            filter_factory: Callable[[str],
                                                     Callable[[Dict[str, Any]],
                                                              bool]],
                            resource_type: str) -> None:
    """Reject operations on entities that still own clusters or managed jobs.

    Clusters and jobs are fetched once; each entity is then matched against
    them with the predicate produced by ``filter_factory``. All offending
    entities are reported together in a single exception.

    Args:
        resource_operations: List of tuples (resource_name, operation) where
            operation is 'update' or 'delete'.
        filter_factory: Function that takes a resource_name and returns a
            filter function for clusters/jobs.
        resource_type: Type of resource being checked ('user' or
            'workspace'). For 'user', the id is replaced in the message by
            the user's name when one is available.

    Raises:
        ValueError: If any resource has active clusters or managed jobs.
    """
    clusters, managed_jobs = _get_active_resources()

    failures = []
    for entity, operation in resource_operations:
        matches = filter_factory(entity)
        owned_clusters = [c for c in clusters if matches(c)]
        owned_jobs = [j for j in managed_jobs if matches(j)]

        problems = []
        if owned_clusters:
            names = ', '.join([c['name'] for c in owned_clusters])
            problems.append(
                f'{len(owned_clusters)} active cluster(s): {names}')
        if owned_jobs:
            job_ids = ', '.join([str(j['job_id']) for j in owned_jobs])
            problems.append(f'{len(owned_jobs)} active managed job(s): '
                            f'{job_ids}')

        if not problems:
            continue

        # For users, `entity` is the user id; show the name when known.
        shown_name = entity
        if resource_type == 'user':
            user_info = global_user_state.get_user(entity)
            if user_info and user_info.name:
                shown_name = user_info.name
        summary = ' and '.join(problems)
        failures.append(f'Cannot {operation} {resource_type} {shown_name!r} '
                        f'because it has {summary}.')

    if not failures:
        return

    if len(failures) == 1:
        message = failures[0] + ' Please terminate these resources first.'
    else:
        bullet_lines = '\n'.join(f'• {msg}' for msg in failures)
        message = (f'Cannot proceed due to active resources in '
                   f'{len(failures)} {resource_type}(s):\n' + bullet_lines +
                   '\nPlease terminate these resources first.')
    raise ValueError(message)
139
+
140
+
141
def check_users_workspaces_active_resources(
        user_ids: List[str],
        workspace_names: List[str]) -> Tuple[str, List[str], Dict[str, str]]:
    """Find active resources in the workspaces owned by other users.

    Looks at the active clusters and managed jobs of ``workspace_names`` and
    reports those whose 'user_hash' is set and not contained in ``user_ids``.

    Args:
        user_ids: List of user_id.
        workspace_names: List of workspace_name.

    Returns:
        resource_error_summary: Description of the offending clusters/jobs,
            or '' when there are none.
        missed_users_names: Display names (name, else id) of the owners that
            are known to the user store.
        missed_user_dict: Mapping of user id to that same display name.
    """
    clusters, managed_jobs = _get_active_resources_for_workspaces(
        workspace_names)

    outsiders = set()
    summary_parts = []

    # Check clusters
    foreign_clusters = []
    for cluster in clusters or []:
        owner = cluster.get('user_hash')
        if not owner or owner in user_ids:
            continue
        outsiders.add(owner)
        foreign_clusters.append(cluster['name'])
    if foreign_clusters:
        summary_parts.append(f'{len(foreign_clusters)} active cluster(s):'
                             f' {", ".join(foreign_clusters)}')

    # Check managed jobs
    foreign_jobs = []
    for job in managed_jobs or []:
        owner = job.get('user_hash')
        if not owner or owner in user_ids:
            continue
        outsiders.add(owner)
        foreign_jobs.append(str(job['job_id']))
    if foreign_jobs:
        summary_parts.append(f'{len(foreign_jobs)} active'
                             f' managed job(s): {", ".join(foreign_jobs)}')

    resource_error_summary = ' and '.join(summary_parts)

    missed_users_names = []
    missed_user_dict = {}
    if outsiders:
        # Only users returned by the user store get an entry.
        for user in global_user_state.get_all_users():
            if user.id not in outsiders:
                continue
            display_name = user.name if user.name else user.id
            missed_users_names.append(display_name)
            missed_user_dict[user.id] = display_name
    return resource_error_summary, missed_users_names, missed_user_dict
199
+
200
+
201
def _get_active_resources_for_workspaces(
    workspace_names: List[str]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Collect the active clusters and managed jobs of the given workspaces.

    A record belongs to a workspace through its 'workspace' entry; records
    without that key count as ``constants.SKYPILOT_DEFAULT_WORKSPACE``.

    Args:
        workspace_names: List of workspace_name.

    Returns:
        all_clusters: Clusters whose workspace is in ``workspace_names``.
        all_managed_jobs: Managed jobs whose workspace is in
            ``workspace_names``.
        Both lists are empty when ``workspace_names`` is empty.
    """

    def make_membership_filter(names: List[str]):

        def in_selected_workspaces(resource) -> bool:
            workspace = resource.get('workspace',
                                     constants.SKYPILOT_DEFAULT_WORKSPACE)
            return workspace in names

        return in_selected_workspaces

    if workspace_names:
        return _get_active_resources_by_names(workspace_names,
                                              make_membership_filter)
    return [], []
222
+
223
+
224
def _get_active_resources_by_names(
    resource_names: List[str],
    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Return the active clusters and managed jobs selected by a filter.

    Args:
        resource_names: List of resource_name.
        filter_factory: Function that takes the whole list of resource names
            and returns a filter function for clusters/jobs.

    Returns:
        all_clusters: Clusters accepted by the filter.
        all_managed_jobs: Managed jobs accepted by the filter.
    """
    all_clusters, all_managed_jobs = _get_active_resources()

    # A single predicate covers every requested name.
    keep = filter_factory(resource_names)

    selected_clusters = []
    if all_clusters:
        selected_clusters = list(filter(keep, all_clusters))

    selected_jobs = []
    if all_managed_jobs:
        selected_jobs = list(filter(keep, all_managed_jobs))

    return selected_clusters, selected_jobs
262
+
263
+
264
def _get_active_resources(
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Fetch every cluster record and every unfinished managed job.

    The two lookups run concurrently on a two-worker thread pool.

    Returns:
        all_clusters: Result of ``global_user_state.get_clusters()``.
        all_managed_jobs: Jobs returned by the managed-jobs queue with
            ``skip_finished=True`` for all users, or [] when that query
            raises ``ClusterNotUpError``.
    """

    def fetch_clusters() -> List[Dict[str, Any]]:
        return global_user_state.get_clusters()

    def fetch_managed_jobs() -> List[Dict[str, Any]]:
        # pylint: disable=import-outside-toplevel
        from sky.jobs.server import core as managed_jobs_core
        try:
            jobs, _, _, _ = managed_jobs_core.queue_v2(
                refresh=False,
                skip_finished=True,
                all_users=True,
                fields=['job_id', 'user_hash', 'workspace'])
        except exceptions.ClusterNotUpError:
            logger.warning('All jobs should be finished.')
            return []
        else:
            return jobs

    # Fetch both clusters and jobs in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        pending_clusters = pool.submit(fetch_clusters)
        pending_jobs = pool.submit(fetch_managed_jobs)

        all_clusters = pending_clusters.result()
        all_managed_jobs = pending_jobs.result()

    return all_clusters, all_managed_jobs