skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,17 @@
2
2
  import base64
3
3
  import pickle
4
4
  import typing
5
- from typing import Any, Dict, List, Optional, Tuple
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from sky import jobs as managed_jobs
8
8
  from sky import models
9
- from sky.clouds.service_catalog import common
9
+ from sky.catalog import common
10
10
  from sky.data import storage
11
11
  from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.schemas.api import responses
12
13
  from sky.serve import serve_state
13
14
  from sky.server import constants as server_constants
14
15
  from sky.skylet import job_lib
15
- from sky.utils import registry
16
16
  from sky.utils import status_lib
17
17
 
18
18
  if typing.TYPE_CHECKING:
@@ -51,13 +51,19 @@ def default_decode_handler(return_value: Any) -> Any:
51
51
 
52
52
 
53
53
  @register_decoders('status')
54
- def decode_status(return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
54
+ def decode_status(
55
+ return_value: List[Dict[str, Any]]) -> List[responses.StatusResponse]:
55
56
  clusters = return_value
57
+ response = []
56
58
  for cluster in clusters:
57
- cluster['handle'] = decode_and_unpickle(cluster['handle'])
59
+ # handle may not always be present in the response.
60
+ if 'handle' in cluster and cluster['handle'] is not None:
61
+ cluster['handle'] = decode_and_unpickle(cluster['handle'])
58
62
  cluster['status'] = status_lib.ClusterStatus(cluster['status'])
59
-
60
- return clusters
63
+ if 'is_managed' not in cluster:
64
+ cluster['is_managed'] = False
65
+ response.append(responses.StatusResponse.model_validate(cluster))
66
+ return response
61
67
 
62
68
 
63
69
  @register_decoders('status_kubernetes')
@@ -66,7 +72,7 @@ def decode_status_kubernetes(
66
72
  List[Dict[str, Any]], Optional[str]]
67
73
  ) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
68
74
  List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
69
- List[Dict[str, Any]], Optional[str]]:
75
+ List[responses.ManagedJobRecord], Optional[str]]:
70
76
  (encoded_all_clusters, encoded_unmanaged_clusters, all_jobs,
71
77
  context) = return_value
72
78
  all_clusters = []
@@ -79,6 +85,7 @@ def decode_status_kubernetes(
79
85
  cluster['status'] = status_lib.ClusterStatus(cluster['status'])
80
86
  unmanaged_clusters.append(
81
87
  kubernetes_utils.KubernetesSkyPilotClusterInfoPayload(**cluster))
88
+ all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
82
89
  return all_clusters, unmanaged_clusters, all_jobs, context
83
90
 
84
91
 
@@ -95,24 +102,53 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':
95
102
 
96
103
 
97
104
  @register_decoders('queue')
98
- def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
105
+ def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
99
106
  jobs = return_value
100
107
  for job in jobs:
101
108
  job['status'] = job_lib.JobStatus(job['status'])
102
- return jobs
109
+ return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
103
110
 
104
111
 
105
112
  @register_decoders('jobs.queue')
106
113
  def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
107
- jobs = return_value
108
- for job in jobs:
109
- job['status'] = managed_jobs.ManagedJobStatus(job['status'])
110
- return jobs
111
-
112
-
113
- @register_decoders('serve.status')
114
- def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
115
- service_statuses = return_value
114
+ # To keep backward compatibility with v0.10.2
115
+ return decode_jobs_queue_v2(return_value)
116
+
117
+
118
+ @register_decoders('jobs.queue_v2')
119
+ def decode_jobs_queue_v2(
120
+ return_value
121
+ ) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
122
+ List[responses.ManagedJobRecord]]:
123
+ """Decode jobs queue response.
124
+
125
+ Supports legacy list, or a dict {jobs, total, total_no_filter,
126
+ status_counts}.
127
+
128
+ - Returns either list[job] or tuple(list[job], total, status_counts,
129
+ total_no_filter)
130
+ """
131
+ # Case 1: dict shape {jobs, total, total_no_filter, status_counts}
132
+ if isinstance(return_value, dict):
133
+ jobs = return_value.get('jobs', [])
134
+ total = return_value.get('total', len(jobs))
135
+ total_no_filter = return_value.get('total_no_filter', total)
136
+ status_counts = return_value.get('status_counts', {})
137
+ for job in jobs:
138
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
139
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
140
+ return jobs, total, status_counts, total_no_filter
141
+ else:
142
+ # Case 2: legacy list
143
+ jobs = return_value
144
+ for job in jobs:
145
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
146
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
147
+ return jobs
148
+
149
+
150
+ def _decode_serve_status(
151
+ service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
116
152
  for service_status in service_statuses:
117
153
  service_status['status'] = serve_state.ServiceStatus(
118
154
  service_status['status'])
@@ -123,6 +159,16 @@ def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
123
159
  return service_statuses
124
160
 
125
161
 
162
@register_decoders('serve.status')
def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
    """Decode a 'serve.status' payload.

    Registered under the name 'serve.status'; the actual work is delegated
    to ``_decode_serve_status``.
    """
    return _decode_serve_status(return_value)
165
+
166
+
167
@register_decoders('jobs.pool_status')
def decode_jobs_pool_status(return_value: List[dict]) -> List[Dict[str, Any]]:
    """Decode a 'jobs.pool_status' payload.

    Pool statuses go through the same ``_decode_serve_status`` helper that
    the 'serve.status' decoder uses.
    """
    return _decode_serve_status(return_value)
170
+
171
+
126
172
  @register_decoders('cost_report')
127
173
  def decode_cost_report(
128
174
  return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -135,16 +181,6 @@ def decode_cost_report(
135
181
  return return_value
136
182
 
137
183
 
138
- @register_decoders('enabled_clouds')
139
- def decode_enabled_clouds(return_value: List[str]) -> List['clouds.Cloud']:
140
- clouds = []
141
- for cloud_name in return_value:
142
- cloud = registry.CLOUD_REGISTRY.from_str(cloud_name)
143
- assert cloud is not None, return_value
144
- clouds.append(cloud)
145
- return clouds
146
-
147
-
148
184
  @register_decoders('list_accelerators')
149
185
  def decode_list_accelerators(
150
186
  return_value: Dict[str, List[List[Any]]]
@@ -160,14 +196,24 @@ def decode_list_accelerators(
160
196
 
161
197
@register_decoders('storage_ls')
def decode_storage_ls(
        return_value: List[Dict[str, Any]]) -> List[responses.StorageRecord]:
    """Decode a 'storage_ls' payload into storage records.

    Each entry's ``status`` is replaced in place by a
    ``status_lib.StorageStatus`` and each element of ``store`` by a
    ``storage.StoreType``; the entries are then expanded into
    ``responses.StorageRecord`` objects.
    """
    for entry in return_value:
        entry['status'] = status_lib.StorageStatus(entry['status'])
        entry['store'] = [storage.StoreType(name) for name in entry['store']]
    return [responses.StorageRecord(**entry) for entry in return_value]
209
+
210
+
211
@register_decoders('volume_list')
def decode_volume_list(
        return_value: List[Dict[str, Any]]) -> List[responses.VolumeRecord]:
    """Decode a 'volume_list' payload.

    Each dict in ``return_value`` is expanded as keyword arguments into a
    ``responses.VolumeRecord``; no field is converted beforehand.
    """
    return [
        responses.VolumeRecord(**volume_info) for volume_info in return_value
    ]
171
217
 
172
218
 
173
219
  @register_decoders('job_status')
@@ -190,3 +236,8 @@ def decode_job_status(
190
236
def decode_kubernetes_node_info(
        return_value: Dict[str, Any]) -> models.KubernetesNodesInfo:
    """Rebuild a ``models.KubernetesNodesInfo`` from its dict form.

    The conversion itself is done by ``KubernetesNodesInfo.from_dict``.
    """
    return models.KubernetesNodesInfo.from_dict(return_value)
239
+
240
+
241
@register_decoders('endpoints')
def decode_endpoints(return_value: Dict[int, str]) -> Dict[int, str]:
    """Decode an 'endpoints' payload, coercing every key to ``int``.

    NOTE(review): the ``int(k)`` coercion suggests keys presumably arrive
    as strings despite the ``Dict[int, str]`` annotation - confirm against
    the matching encoder.
    """
    return {int(k): v for k, v in return_value.items()}
@@ -6,14 +6,17 @@ import base64
6
6
  import dataclasses
7
7
  import pickle
8
8
  import typing
9
- from typing import Any, Dict, List, Optional, Tuple
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
10
 
11
+ from sky import models
12
+ from sky.catalog import common
13
+ from sky.schemas.api import responses
11
14
  from sky.server import constants as server_constants
15
+ from sky.utils import serialize_utils
12
16
 
13
17
  if typing.TYPE_CHECKING:
14
18
  from sky import backends
15
19
  from sky import clouds
16
- from sky import models
17
20
  from sky.provision.kubernetes import utils as kubernetes_utils
18
21
 
19
22
  handlers: Dict[str, Any] = {}
@@ -21,6 +24,9 @@ handlers: Dict[str, Any] = {}
21
24
 
22
25
def pickle_and_encode(obj: Any) -> str:
    """Pickle ``obj`` and return the result as a base64 text string.

    The object is first passed through
    ``serialize_utils.prepare_handle_for_backwards_compatibility`` so that
    handles which did not go through a dedicated encoder are still
    processed for backwards compatibility.

    Raises:
        ValueError: if a ``TypeError`` is raised while preparing, pickling
            or encoding the object.
    """
    try:
        # Rebind `obj` on purpose: the error message below reports whatever
        # `obj` refers to at the time of the failure.
        obj = serialize_utils.prepare_handle_for_backwards_compatibility(obj)
        pickled = pickle.dumps(obj)
        encoded = base64.b64encode(pickled)
        return encoded.decode('utf-8')
    except TypeError as e:
        raise ValueError(f'Failed to pickle object: {obj}') from e
@@ -51,13 +57,29 @@ def default_encoder(return_value: Any) -> Any:
51
57
 
52
58
 
53
59
@register_encoder('status')
def encode_status(
        clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
    """Encode cluster status records into plain dicts.

    For every cluster a fresh dict is produced from
    ``model_dump(exclude_none=True)``; the input records themselves are not
    modified. ``status`` is replaced by its ``.value`` and ``handle`` by its
    pickled, base64-encoded form.
    """
    encoded_clusters = []
    for cluster in clusters:
        encoded = cluster.model_dump(exclude_none=True)
        # last_use and status_updated_at used to be non-optional, so they
        # are filled with defaults when missing from the dump.
        # TODO(syang): remove this after v0.10.7 or v0.11.0
        encoded.setdefault('last_use', '')
        encoded.setdefault('status_updated_at', 0)
        encoded['status'] = cluster['status'].value
        encoded['handle'] = pickle_and_encode(
            serialize_utils.prepare_handle_for_backwards_compatibility(
                cluster['handle']))
        # TODO (syang) This field is still returned for backwards
        # compatibility, but its payload is always None.
        # Remove this field at or after v0.10.7 or v0.11.0
        encoded['storage_mounts_metadata'] = pickle_and_encode(None)
        encoded_clusters.append(encoded)
    return encoded_clusters
61
83
 
62
84
 
63
85
  @register_encoder('launch', 'exec', 'jobs.launch')
@@ -65,6 +87,7 @@ def encode_launch(
65
87
  job_id_handle: Tuple[Optional[int], Optional['backends.ResourceHandle']]
66
88
  ) -> Dict[str, Any]:
67
89
  job_id, handle = job_id_handle
90
+ handle = serialize_utils.prepare_handle_for_backwards_compatibility(handle)
68
91
  return {
69
92
  'job_id': job_id,
70
93
  'handle': pickle_and_encode(handle),
@@ -73,14 +96,21 @@ def encode_launch(
73
96
 
74
97
@register_encoder('start')
def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
    """Encode the resource handle returned by 'start'.

    The handle is run through
    ``serialize_utils.prepare_handle_for_backwards_compatibility`` and then
    pickled and base64-encoded by ``pickle_and_encode``.
    """
    resource_handle = (
        serialize_utils.prepare_handle_for_backwards_compatibility(
            resource_handle))
    return pickle_and_encode(resource_handle)
77
103
 
78
104
 
79
105
@register_encoder('queue')
def encode_queue(
        jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
    """Encode cluster job records into plain dicts.

    Each record is dumped with ``model_dump()`` and the ``status`` entry of
    the resulting dict is set to the ``.value`` of the record's status. The
    input records are not modified.
    """
    return [{
        **job.model_dump(),
        'status': job['status'].value,
    } for job in jobs]
84
114
 
85
115
 
86
116
  @register_encoder('status_kubernetes')
@@ -88,7 +118,7 @@ def encode_status_kubernetes(
88
118
  return_value: Tuple[
89
119
  List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
90
120
  List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
91
- List[Dict[str, Any]], Optional[str]]
121
+ List[responses.ManagedJobRecord], Optional[str]]
92
122
  ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]],
93
123
  Optional[str]]:
94
124
  all_clusters, unmanaged_clusters, all_jobs, context = return_value
@@ -102,6 +132,7 @@ def encode_status_kubernetes(
102
132
  encoded_cluster = dataclasses.asdict(cluster)
103
133
  encoded_cluster['status'] = encoded_cluster['status'].value
104
134
  encoded_unmanaged_clusters.append(encoded_cluster)
135
+ all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
105
136
  return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
106
137
 
107
138
 
@@ -112,25 +143,68 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
112
143
  return jobs
113
144
 
114
145
 
115
- @register_encoder('serve.status')
116
- def encode_serve_status(
146
@register_encoder('jobs.queue_v2')
def encode_jobs_queue_v2(
        jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
    """Encode a managed jobs queue result.

    Accepted inputs:

    - a non-tuple iterable of job records;
    - a ``(jobs, total)`` tuple, in which case ``total_no_filter`` mirrors
      ``total`` and ``status_counts`` is empty;
    - a ``(jobs, total, status_counts, total_no_filter)`` tuple.

    Every job is dumped with ``model_dump(by_alias=True)`` and its
    ``status`` is replaced by the status ``.value``.

    Returns:
        The plain list of encoded jobs when ``total`` is None (always the
        case for non-tuple input), otherwise a dict with the keys ``jobs``,
        ``total``, ``total_no_filter`` and ``status_counts``.

    Raises:
        ValueError: if a tuple of any other length is passed.
    """
    status_counts: Dict[str, int] = {}
    total = None
    total_no_filter = None
    jobs = jobs_or_tuple
    if isinstance(jobs_or_tuple, tuple):
        arity = len(jobs_or_tuple)
        if arity == 4:
            jobs, total, status_counts, total_no_filter = jobs_or_tuple
        elif arity == 2:
            jobs, total = jobs_or_tuple
            total_no_filter = total
        else:
            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')

    encoded_jobs = [record.model_dump(by_alias=True) for record in jobs]
    for encoded in encoded_jobs:
        encoded['status'] = encoded['status'].value

    if total is None:
        return encoded_jobs
    return {
        'jobs': encoded_jobs,
        'total': total,
        'total_no_filter': total_no_filter,
        'status_counts': status_counts
    }
173
+
174
+
175
def _encode_serve_status(
        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode service status dicts in place and return the same list.

    The ``status`` of every service and of every entry under
    ``replica_info`` (when present) is replaced by its ``.value``. Each
    replica ``handle`` is prepared for backwards compatibility and then
    pickled and base64-encoded.
    """
    for service in service_statuses:
        service['status'] = service['status'].value
        replicas = service.get('replica_info', [])
        for replica in replicas:
            replica['status'] = replica['status'].value
            replica['handle'] = pickle_and_encode(
                serialize_utils.prepare_handle_for_backwards_compatibility(
                    replica['handle']))
    return service_statuses
124
185
 
125
186
 
187
@register_encoder('serve.status')
def encode_serve_status(
        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode a 'serve.status' result.

    Registered under the name 'serve.status'; the actual work is delegated
    to ``_encode_serve_status``.
    """
    return _encode_serve_status(service_statuses)
191
+
192
+
193
@register_encoder('jobs.pool_status')
def encode_jobs_pool_status(
        pool_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode a 'jobs.pool_status' result.

    Pool statuses go through the same ``_encode_serve_status`` helper that
    the 'serve.status' encoder uses.
    """
    return _encode_serve_status(pool_statuses)
197
+
198
+
126
199
@register_encoder('cost_report')
def encode_cost_report(
        cost_report: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Encode cost report entries in place and return the same list.

    A non-None ``status`` is replaced by its ``.value``. When an entry has
    a ``resources`` key, its value is pickled and base64-encoded; entries
    without that key are left as they are.
    """
    for entry in cost_report:
        status = entry['status']
        if status is not None:
            entry['status'] = status.value
        if 'resources' in entry:
            entry['resources'] = pickle_and_encode(entry['resources'])
    return cost_report
135
209
 
136
210
 
@@ -142,22 +216,66 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
142
216
 
143
217
@register_encoder('storage_ls')
def encode_storage_ls(
        return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
    """Encode storage records into plain dicts.

    All records are dumped with ``model_dump()`` first; then, in each
    resulting dict, ``status`` is replaced by its ``.value`` and ``store``
    by the list of its elements' ``.value``.
    """
    encoded_records = [record.model_dump() for record in return_value]
    for encoded in encoded_records:
        encoded.update(
            status=encoded['status'].value,
            store=[store_type.value for store_type in encoded['store']],
        )
    return encoded_records
225
+
226
+
227
@register_encoder('volume_list')
def encode_volume_list(
        return_value: List[responses.VolumeRecord]) -> List[Dict[str, Any]]:
    """Encode volume records by calling ``model_dump()`` on each of them."""
    return [volume_info.model_dump() for volume_info in return_value]
150
231
 
151
232
 
152
233
@register_encoder('job_status')
def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
    """Encode a job-id to status mapping.

    Non-None statuses are replaced by their ``.value`` in the input mapping
    itself; the returned dict is a new mapping with every key converted to
    ``str``.
    """
    for job_id, status in return_value.items():
        # Only values are reassigned, so iterating over items() is safe.
        if status is not None:
            return_value[job_id] = status.value
    return {str(job_id): status for job_id, status in return_value.items()}
158
239
 
159
240
 
160
241
@register_encoder('kubernetes_node_info')
def encode_kubernetes_node_info(
        return_value: 'models.KubernetesNodesInfo') -> Dict[str, Any]:
    """Encode Kubernetes node info by calling its ``to_dict()`` method."""
    return return_value.to_dict()
245
+
246
+
247
@register_encoder('endpoints')
def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
    """Encode an endpoints mapping by converting every key to ``str``.

    Values are passed through unchanged and the input mapping is not
    modified.
    """
    return {str(k): v for k, v in return_value.items()}
250
+
251
+
252
@register_encoder('realtime_kubernetes_gpu_availability')
def encode_realtime_gpu_availability(
    return_value: List[Tuple[str,
                             List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
    """Encode per-context realtime GPU availability.

    Every ``models.RealtimeGpuAvailability`` entry is turned into a plain
    list for JSON serialization; the ``(context, gpu_list)`` pairing is
    kept as is.
    """

    def _as_list(gpu: Any) -> List[Any]:
        # Type-check the entry, then flatten it into a list.
        assert isinstance(gpu, models.RealtimeGpuAvailability), (
            f'Expected RealtimeGpuAvailability, got {type(gpu)}')
        return list(gpu)

    return [(context, [_as_list(gpu)
                       for gpu in gpu_list])
            for context, gpu_list in return_value]
267
+
268
+
269
@register_encoder('list_accelerators')
def encode_list_accelerators(
        return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
    """Encode the accelerator-name to instance-info mapping.

    Every ``common.InstanceTypeInfo`` entry is turned into a plain list for
    JSON serialization; the accelerator names are kept as keys.
    """

    def _as_list(instance: Any) -> List[Any]:
        # Type-check the entry, then flatten it into a list.
        assert isinstance(instance, common.InstanceTypeInfo), (
            f'Expected InstanceTypeInfo, got {type(instance)}')
        return list(instance)

    return {
        accelerator_name: [_as_list(instance) for instance in instances]
        for accelerator_name, instances in return_value.items()
    }
@@ -0,0 +1,106 @@
1
+ """Request execution threads management."""
2
+
3
+ import concurrent.futures
4
+ import threading
5
+ from typing import Callable, Set
6
+
7
+ from sky import exceptions
8
+ from sky import sky_logging
9
+ from sky.utils import atomic
10
+
11
+ logger = sky_logging.init_logger(__name__)
12
+
13
+
14
+ class OnDemandThreadExecutor(concurrent.futures.Executor):
15
+ """An executor that creates a new thread for each task and destroys it
16
+ after the task is completed.
17
+
18
+ Note(dev):
19
+ We raise an error instead of queuing the request if the limit is reached, so
20
+ that:
21
+ 1. the request might be handled by other processes that have idle workers
22
+ upon retry;
23
+ 2. if not, then users can be clearly hinted that they need to scale the API
24
+ server to support higher concurrency.
25
+ So this executor is only suitable for carefully selected cases where the
26
+ error can be properly handled by caller. To make this executor general, we
27
+ need to support configuring the queuing behavior (exception or queueing).
28
+ """
29
+
30
+ def __init__(self, name: str, max_workers: int):
31
+ self.name: str = name
32
+ self.max_workers: int = max_workers
33
+ self.running: atomic.AtomicInt = atomic.AtomicInt(0)
34
+ self._shutdown: bool = False
35
+ self._shutdown_lock: threading.Lock = threading.Lock()
36
+ self._threads: Set[threading.Thread] = set()
37
+ self._threads_lock: threading.Lock = threading.Lock()
38
+
39
+ def _cleanup_thread(self, thread: threading.Thread):
40
+ with self._threads_lock:
41
+ self._threads.discard(thread)
42
+
43
+ def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
44
+ *args, **kwargs):
45
+ try:
46
+ result = fn(*args, **kwargs)
47
+ fut.set_result(result)
48
+ except Exception as e: # pylint: disable=broad-except
49
+ logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
50
+ fut.set_exception(e)
51
+ finally:
52
+ self.running.decrement()
53
+ self._cleanup_thread(threading.current_thread())
54
+
55
+ def check_available(self, borrow: bool = False) -> int:
56
+ """Check if there are available workers.
57
+
58
+ Args:
59
+ borrow: If True, the caller borrow a worker from the executor.
60
+ The caller is responsible for returning the worker to the
61
+ executor after the task is completed.
62
+ """
63
+ count = self.running.increment()
64
+ if count > self.max_workers:
65
+ self.running.decrement()
66
+ raise exceptions.ConcurrentWorkerExhaustedError(
67
+ f'Maximum concurrent workers {self.max_workers} of threads '
68
+ f'executor [{self.name}] reached')
69
+ if not borrow:
70
+ self.running.decrement()
71
+ return count
72
+
73
+ def submit(self, fn, /, *args, **kwargs):
74
+ with self._shutdown_lock:
75
+ if self._shutdown:
76
+ raise RuntimeError(
77
+ 'Cannot submit task after executor is shutdown')
78
+ count = self.check_available(borrow=True)
79
+ fut: concurrent.futures.Future = concurrent.futures.Future()
80
+ # Name is assigned for debugging purpose, duplication is fine
81
+ thread = threading.Thread(target=self._task_wrapper,
82
+ name=f'{self.name}-{count}',
83
+ args=(fn, fut, *args),
84
+ kwargs=kwargs,
85
+ daemon=True)
86
+ with self._threads_lock:
87
+ self._threads.add(thread)
88
+ try:
89
+ thread.start()
90
+ except Exception as e:
91
+ self.running.decrement()
92
+ self._cleanup_thread(thread)
93
+ fut.set_exception(e)
94
+ raise
95
+ assert thread.ident is not None, 'Thread should be started'
96
+ return fut
97
+
98
+ def shutdown(self, wait=True):
99
+ with self._shutdown_lock:
100
+ self._shutdown = True
101
+ if not wait:
102
+ return
103
+ with self._threads_lock:
104
+ threads = list(self._threads)
105
+ for t in threads:
106
+ t.join()