skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/users/server.py ADDED
@@ -0,0 +1,720 @@
1
+ """REST API for user management."""
2
+
3
+ import contextlib
4
+ import hashlib
5
+ import os
6
+ import re
7
+ import secrets
8
+ import time
9
+ from typing import Any, Dict, Generator, List
10
+
11
+ import fastapi
12
+ import filelock
13
+
14
+ from sky import global_user_state
15
+ from sky import models
16
+ from sky import sky_logging
17
+ from sky.server import common as server_common
18
+ from sky.server.requests import payloads
19
+ from sky.skylet import constants
20
+ from sky.users import permission
21
+ from sky.users import rbac
22
+ from sky.users import token_service
23
+ from sky.utils import common
24
+ from sky.utils import common_utils
25
+ from sky.utils import resource_checker
26
+
27
+ logger = sky_logging.init_logger(__name__)
28
+
29
+ # File locks for user management.
30
+ USER_LOCK_PATH = os.path.expanduser('~/.sky/.{user_id}.lock')
31
+ USER_LOCK_TIMEOUT_SECONDS = 20
32
+
33
+ router = fastapi.APIRouter()
34
+
35
+
36
+ # All handlers in this module are sync so that FastAPI runs them in a
37
+ # thread pool, which avoids blocking the async event loop.
38
+ # TODO(aylei): make these async once global_user_state has async
39
+ # support.
40
@router.get('')
def users() -> List[Dict[str, Any]]:
    """Lists all regular users along with their assigned role.

    Returns:
        One dict per user with the keys 'id', 'name', 'created_at' and
        'role'. Service accounts are left out, and 'role' is an empty
        string for users that are not found under any supported role.
    """
    accounts = global_user_state.get_all_users()

    # Invert the role -> user ids mapping. If an id is listed under more
    # than one role, the role visited last wins.
    role_by_user_id = {}
    for role_name in rbac.get_supported_roles():
        member_ids = permission.permission_service.get_users_for_role(
            role_name)
        role_by_user_id.update(dict.fromkeys(member_ids, role_name))

    return [{
        'id': account.id,
        'name': account.name,
        'created_at': account.created_at,
        'role': role_by_user_id.get(account.id, '')
    } for account in accounts if not account.is_service_account()]
64
+
65
+
66
@router.get('/role')
def get_current_user_role(request: fastapi.Request):
    """Reports the id, name and role of the user behind the request.

    Args:
        request: The incoming request; `request.state.auth_user` is read to
            identify the caller.

    Returns:
        A dict with the keys 'id', 'name' and 'role'. When the request has
        no authenticated user, 'id' and 'name' are empty and 'role' is the
        admin role. Otherwise 'role' is the user's first role, or an empty
        string if the user has none.
    """
    # TODO(hailong): is there a reliable way to get the user
    # hash for the request without 'X-Auth-Request-Email' header?
    requester = request.state.auth_user
    if requester is not None:
        roles = permission.permission_service.get_user_roles(requester.id)
        primary_role = roles[0] if roles else ''
        return {
            'id': requester.id,
            'name': requester.name,
            'role': primary_role
        }
    # No authenticated user attached to the request.
    return {'id': '', 'name': '', 'role': rbac.RoleName.ADMIN.value}
80
+
81
+
82
@router.post('/create')
def user_create(user_create_body: payloads.UserCreateBody) -> None:
    """Creates a user with a password and assigns it a role.

    Args:
        user_create_body: Carries `username`, `password` and an optional
            `role`; the default role is used when `role` is empty.

    Raises:
        fastapi.HTTPException: With status 400 if the username or password
            is missing, the role is not supported, or a user with the same
            name already exists.
    """
    username = user_create_body.username
    password = user_create_body.password
    role = user_create_body.role

    if not (username and password):
        raise fastapi.HTTPException(status_code=400,
                                    detail='Username and password are required')

    if not role:
        role = rbac.get_default_role()
    elif role not in rbac.get_supported_roles():
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Invalid role: {role}')

    password_hash = server_common.crypt_ctx.hash(password)
    # The user id is a truncated MD5 digest of the username.
    # NOTE(review): only a name collision is checked below, not a collision
    # on the truncated id - confirm this is acceptable.
    name_digest = hashlib.md5(username.encode()).hexdigest()
    user_hash = name_digest[:common_utils.USER_HASH_LENGTH]

    with _user_lock(user_hash):
        existing = global_user_state.get_user_by_name(username)
        if existing:
            raise fastapi.HTTPException(
                status_code=400, detail=f'User {username!r} already exists')
        new_user = models.User(id=user_hash,
                               name=username,
                               password=password_hash)
        global_user_state.add_or_update_user(new_user)
        permission.permission_service.update_role(user_hash, role)
110
+
111
+
112
@router.post('/update')
def user_update(request: fastapi.Request,
                user_update_body: payloads.UserUpdateBody) -> None:
    """Updates a user's role and/or password.

    Args:
        request: The incoming request; `request.state.auth_user` identifies
            the caller. When it is None, no caller permission checks run.
        user_update_body: Carries the target `user_id` plus the optional
            new `role` and `password`.

    Raises:
        fastapi.HTTPException: With status 400 if the role is not supported,
            the target user does not exist, or the change targets an
            internal user. With status 403 if the caller has no role, or is
            not an admin and tries to change a role or another user's
            password.
    """
    user_id = user_update_body.user_id
    role = user_update_body.role
    password = user_update_body.password

    supported_roles = rbac.get_supported_roles()
    if role and role not in supported_roles:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Invalid role: {role}')

    # A role change is needed only if a role was given and it differs from
    # the target's current first role (or the target has no role yet).
    target_user_roles = permission.permission_service.get_user_roles(user_id)
    role_differs = not target_user_roles or role != target_user_roles[0]
    need_update_role = bool(role and role_differs)

    current_user = request.state.auth_user
    if current_user is not None:
        current_user_roles = permission.permission_service.get_user_roles(
            current_user.id)
        if not current_user_roles:
            raise fastapi.HTTPException(status_code=403, detail='Invalid user')
        caller_is_admin = (
            current_user_roles[0] == rbac.RoleName.ADMIN.value)
        if not caller_is_admin:
            if need_update_role:
                raise fastapi.HTTPException(
                    status_code=403, detail='Only admin can update user role')
            changes_other_password = password and user_id != current_user.id
            if changes_other_password:
                raise fastapi.HTTPException(
                    status_code=403,
                    detail='Only admin can update password for other users')

    target = global_user_state.get_user(user_id)
    if target is None:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'User {user_id} does not exist')

    # Disallow updating the internal users.
    internal_ids = (common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID)
    if need_update_role and target.id in internal_ids:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Cannot update role for internal '
                                    f'API server user {target.name}')
    if password and target.id == constants.SKYPILOT_SYSTEM_USER_ID:
        raise fastapi.HTTPException(
            status_code=400,
            detail=f'Cannot update password for internal '
            f'API server user {target.name}')

    with _user_lock(target.id):
        if password:
            password_hash = server_common.crypt_ctx.hash(password)
            updated_user = models.User(id=target.id,
                                       name=target.name,
                                       password=password_hash)
            global_user_state.add_or_update_user(updated_user)
        if need_update_role:
            # Update user role in casbin policy
            permission.permission_service.update_role(target.id, role)
167
+
168
+
169
def _delete_user(user_id: str) -> None:
    """Remove a user record and its permission entry.

    Raises:
        fastapi.HTTPException: 400 when the user does not exist, is one of
            the internal users, or the active-resource check raises
            ``ValueError``.
    """
    target = global_user_state.get_user(user_id)
    if target is None:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'User {user_id} does not exist')

    # Disallow deleting the internal users.
    internal_ids = (common.SERVER_ID, constants.SKYPILOT_SYSTEM_USER_ID)
    if target.id in internal_ids:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Cannot delete internal '
                                    f'API server user {target.name}')

    # Check for active clusters and managed jobs owned by the user
    pending_operations = [(user_id, 'delete')]
    try:
        resource_checker.check_no_active_resources_for_users(
            pending_operations)
    except ValueError as active_resources_error:
        raise fastapi.HTTPException(status_code=400,
                                    detail=str(active_resources_error))

    with _user_lock(user_id):
        global_user_state.delete_user(user_id)
        permission.permission_service.delete_user(user_id)
191
+
192
+
193
@router.post('/delete')
def user_delete(user_delete_body: payloads.UserDeleteBody) -> None:
    """Delete the user identified by the request body's ``user_id``."""
    _delete_user(user_delete_body.user_id)
197
+
198
+
199
@router.post('/import')
def user_import(user_import_body: payloads.UserImportBody) -> Dict[str, Any]:
    """Import users from CSV content.

    The CSV must have a header row containing at least the ``username``,
    ``password`` and ``role`` columns (case-insensitive) followed by at
    least one data row. Fields are parsed with the standard ``csv`` module,
    so quoted fields may contain commas. An empty or unsupported role falls
    back to the default role. A password that the server's crypt context
    identifies as an existing hash is stored as-is; anything else is hashed.

    Returns:
        A dict with ``success_count``, ``error_count``, ``total_processed``,
        ``parse_errors`` and ``creation_errors``.

    Raises:
        fastapi.HTTPException: 400 when the content is empty, is not valid
            CSV, lacks a header or data row, misses a required column, or
            contains no valid user while having parse errors.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import block.
    # pylint: disable=import-outside-toplevel
    import csv
    import io

    csv_content = user_import_body.csv_content

    if not csv_content:
        raise fastapi.HTTPException(status_code=400,
                                    detail='CSV content is required')

    # Parse CSV content. Each record is stored with the physical line number
    # on which it ends, for use in error messages.
    reader = csv.reader(io.StringIO(csv_content.strip()))
    try:
        records = [(reader.line_num, row) for row in reader]
    except csv.Error as e:
        raise fastapi.HTTPException(
            status_code=400, detail=f'Invalid CSV content: {e}') from e
    if len(records) < 2:
        raise fastapi.HTTPException(
            status_code=400,
            detail='CSV must have at least a header row and one data row')

    # Parse headers
    headers = [h.strip().lower() for h in records[0][1]]
    required_headers = ['username', 'password', 'role']

    # Check if all required headers are present
    missing_headers = [
        header for header in required_headers if header not in headers
    ]
    if missing_headers:
        raise fastapi.HTTPException(
            status_code=400,
            detail=f'Missing required columns: {", ".join(missing_headers)}')

    # Role lookups do not depend on the row, so do them once.
    supported_roles = rbac.get_supported_roles()
    default_role = rbac.get_default_role()

    # Parse user data
    users_to_create = []
    parse_errors = []

    for line_no, row in records[1:]:
        # Skip empty (or whitespace-only) lines
        if not row or (len(row) == 1 and not row[0].strip()):
            continue

        values = [v.strip() for v in row]
        if len(values) != len(headers):
            parse_errors.append(f'Line {line_no}: Invalid number of columns')
            continue

        user_data = dict(zip(headers, values))

        # Validate required fields
        if not user_data.get('username') or not user_data.get('password'):
            parse_errors.append(
                f'Line {line_no}: Username and password are required')
            continue

        # Validate role: empty or unsupported roles use the default role
        role = user_data.get('role', '').lower()
        if not role or role not in supported_roles:
            role = default_role

        users_to_create.append({
            'username': user_data['username'],
            'password': user_data['password'],
            'role': role
        })

    if not users_to_create and parse_errors:
        raise fastapi.HTTPException(
            status_code=400,
            detail=f'No valid users found. Errors: {"; ".join(parse_errors)}')

    # Create users
    success_count = 0
    error_count = 0
    creation_errors = []

    for user_data in users_to_create:
        username = user_data['username']
        try:
            password = user_data['password']
            role = user_data['role']

            user_hash = hashlib.md5(
                username.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]

            with _user_lock(user_hash):
                # Check for an existing user while holding the lock, as
                # user_create does, so a concurrent creation of the same
                # user cannot be overwritten.
                if global_user_state.get_user_by_name(username):
                    error_count += 1
                    creation_errors.append(f'{username}: User already exists')
                    continue

                if server_common.crypt_ctx.identify(password) is not None:
                    # Password is already hashed, use it directly
                    password_hash = password
                else:
                    # Password is plain text, hash it
                    password_hash = server_common.crypt_ctx.hash(password)

                global_user_state.add_or_update_user(
                    models.User(id=user_hash,
                                name=username,
                                password=password_hash))
                permission.permission_service.update_role(user_hash, role)

            success_count += 1

        except Exception as e:  # pylint: disable=broad-except
            # Best effort: one failing user must not abort the whole import.
            error_count += 1
            creation_errors.append(f'{username}: {str(e)}')

    return {
        'success_count': success_count,
        'error_count': error_count,
        'total_processed': len(users_to_create),
        'parse_errors': parse_errors,
        'creation_errors': creation_errors
    }
314
+
315
+
316
@router.get('/export')
def user_export() -> Dict[str, Any]:
    """Export all non-service-account users as CSV content.

    The CSV has the header ``username,password,role``. The password column
    carries whatever value is stored on the user record. Rows are written
    with the standard ``csv`` module so that fields containing commas,
    quotes or newlines are quoted instead of corrupting the row layout.

    Returns:
        A dict with ``csv_content`` (newline-separated rows, no trailing
        newline) and ``user_count`` (number of exported users).

    Raises:
        fastapi.HTTPException: 500 when anything fails during the export.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import block.
    # pylint: disable=import-outside-toplevel
    import csv
    import io

    try:
        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator='\n')
        writer.writerow(['username', 'password', 'role'])  # Header

        user_count = 0
        for user in global_user_state.get_all_users():
            # Filter out service accounts
            if user.is_service_account():
                continue

            # Get user role
            user_roles = permission.permission_service.get_user_roles(user.id)
            role = user_roles[0] if user_roles else rbac.get_default_role()
            # Avoid exporting `None` values
            writer.writerow([user.name or '', user.password or '', role or ''])
            user_count += 1

        # Drop the terminator after the last row so that rows are separated,
        # not terminated, by newlines.
        csv_content = buffer.getvalue().rstrip('\n')

        return {'csv_content': csv_content, 'user_count': user_count}

    except Exception as e:
        raise fastapi.HTTPException(
            status_code=500, detail=f'Failed to export users: {str(e)}') from e
355
+
356
+
357
@contextlib.contextmanager
def _user_lock(user_id: str) -> Generator[None, None, None]:
    """Hold the file lock for ``user_id`` while the managed block runs.

    Raises:
        RuntimeError: if ``filelock.Timeout`` is raised while acquiring the
            lock or from within the managed block.
    """
    lock_path = USER_LOCK_PATH.format(user_id=user_id)
    try:
        with filelock.FileLock(lock_path, USER_LOCK_TIMEOUT_SECONDS):
            yield
    except filelock.Timeout as timeout_error:
        message = (f'Failed to update user due to a timeout '
                   f'when trying to acquire the lock at '
                   f'{lock_path}. '
                   'Please try again or manually remove the lock '
                   f'file if you believe it is stale.')
        raise RuntimeError(message) from timeout_error
370
+
371
+
372
+ # ===============================
373
+ # Service account tokens
374
+ # ===============================
375
+ # SkyPilot currently does not distinguish between service accounts and service
376
+ # account tokens, i.e. service accounts have a 1-1 mapping to service account
377
+ # tokens.
378
+
379
+
380
@router.get('/service-account-tokens')
def get_service_account_tokens(
        request: fastapi.Request) -> List[Dict[str, Any]]:
    """List every service account token, visible to any authenticated user.

    Each entry carries the stored token metadata plus the creator's display
    name, the service account's name and the service account's roles.

    Raises:
        fastapi.HTTPException: 401 when the request has no authenticated
            user.
    """
    if request.state.auth_user is None:
        raise fastapi.HTTPException(status_code=401,
                                    detail='Authentication required')

    # Metadata fields copied verbatim from the stored token record.
    copied_fields = (
        'token_id',
        'token_name',
        'created_at',
        'last_used_at',
        'expires_at',
        'creator_user_hash',
        'service_account_user_id',
    )

    listing = []
    # All authenticated users can see all tokens
    for record in global_user_state.get_all_service_account_tokens():
        entry = {field: record[field] for field in copied_fields}
        sa_user_id = record['service_account_user_id']

        # Creator display name, or a placeholder when the creator is gone.
        creator = global_user_state.get_user(record['creator_user_hash'])
        if creator:
            entry['creator_name'] = creator.name
        else:
            entry['creator_name'] = 'Unknown'

        # Service account name, falling back to the token name.
        sa_user = global_user_state.get_user(sa_user_id)
        if sa_user:
            entry['service_account_name'] = sa_user.name
        else:
            entry['service_account_name'] = record['token_name']

        # Roles of the service account.
        entry['service_account_roles'] = (
            permission.permission_service.get_user_roles(sa_user_id))

        listing.append(entry)

    return listing
422
+
423
+
424
+ def _generate_service_account_user_id() -> str:
425
+ """Generate a unique user ID for a service account."""
426
+ random_suffix = secrets.token_hex(8) # 16 character hex string
427
+ service_account_id = (f'sa-{random_suffix}')
428
+ return service_account_id
429
+
430
+
431
@router.post('/service-account-tokens')
def create_service_account_token(
        request: fastapi.Request,
        token_body: payloads.ServiceAccountTokenCreateBody) -> Dict[str, Any]:
    """Create a new service account together with its token.

    A user entry with a freshly generated service account ID is created
    under the given token name, added to the permission system, and a token
    is issued for it. The token value is only returned by this call.

    Returns:
        A dict with the token ID, token name, the token itself, its
        expiration, the service account user ID, the creator's user ID and
        a reminder message.

    Raises:
        fastapi.HTTPException: 401 without an authenticated user; 400 for an
            invalid token name, a negative expiration, or a name that is
            already in use; 500 for any other failure.
    """
    auth_user = request.state.auth_user
    if auth_user is None:
        raise fastapi.HTTPException(status_code=401,
                                    detail='Authentication required')

    token_name = token_body.token_name.strip()

    # Check if token follows a valid format. fullmatch is used so that the
    # whole name, not just a prefix of it, has to satisfy the pattern.
    if not re.fullmatch(constants.CLUSTER_NAME_VALID_REGEX, token_name):
        raise fastapi.HTTPException(
            status_code=400,
            detail='Token name must contain only letters, numbers, and '
            'underscores. Please use a different name.')

    # Validate expiration (allow 0 as special value for "never expire")
    if (token_body.expires_in_days is not None and
            token_body.expires_in_days < 0):
        raise fastapi.HTTPException(
            status_code=400,
            detail='Expiration days must be positive or 0 for never expire')

    try:
        # Generate a unique service account user ID
        service_account_user_id = _generate_service_account_user_id()

        # Create a user entry for the service account
        service_account_user = models.User(id=service_account_user_id,
                                           name=token_name)
        is_new_user = global_user_state.add_or_update_user(
            service_account_user, allow_duplicate_name=False)

        if not is_new_user:
            raise fastapi.HTTPException(
                status_code=400,
                detail=f'Service account with name {token_name!r} '
                f'already exists ({service_account_user_id}). '
                'Please use a different name.')

        # Add service account to permission system with default role
        permission.permission_service.add_user_if_not_exists(
            service_account_user_id)

        # Handle expiration: 0 means "never expire"
        expires_in_days = token_body.expires_in_days
        if expires_in_days == 0:
            expires_in_days = None

        # Create JWT-based token with service account user ID
        token_data = token_service.token_service.create_token(
            creator_user_id=auth_user.id,
            service_account_user_id=service_account_user_id,
            token_name=token_name,
            expires_in_days=expires_in_days)

        # Store token metadata in database
        global_user_state.add_service_account_token(
            token_id=token_data['token_id'],
            token_name=token_name,
            token_hash=token_data['token_hash'],
            creator_user_hash=auth_user.id,
            service_account_user_id=service_account_user_id,
            expires_at=token_data['expires_at'])

        # Return the JWT token only once (never stored in plain text)
        return {
            'token_id': token_data['token_id'],
            'token_name': token_name,
            'token': token_data['token'],  # Full JWT token with sky_ prefix
            'expires_at': token_data['expires_at'],
            'service_account_user_id': service_account_user_id,
            'creator_user_id': auth_user.id,
            'message': 'Please save this token - it will not be shown again!'
        }

    except fastapi.HTTPException:
        # Deliberate HTTP errors raised above (e.g. the 400 for a duplicate
        # name) must reach the client unchanged instead of becoming a 500.
        raise
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Failed to create service account token: {e}')
        raise fastapi.HTTPException(
            status_code=500,
            detail=f'Failed to create service account token: {e}') from e
517
+
518
+
519
@router.post('/service-account-tokens/delete')
def delete_service_account_token(
        request: fastapi.Request,
        token_body: payloads.ServiceAccountTokenDeleteBody) -> Dict[str, str]:
    """Delete a service account token and its service account user.

    Whether the caller may delete the token is decided by the permission
    service's 'delete' check against the token's creator.

    Raises:
        fastapi.HTTPException: 401 without an authenticated user, 404 when
            the token is unknown, 403 when the permission check fails, plus
            whatever deleting the service account user raises.
    """
    caller = request.state.auth_user
    if caller is None:
        raise fastapi.HTTPException(status_code=401,
                                    detail='Authentication required')

    token_id = token_body.token_id

    # Get token info first
    token_record = global_user_state.get_service_account_token(token_id)
    if token_record is None:
        raise fastapi.HTTPException(status_code=404, detail='Token not found')

    # Check permissions using Casbin policy system
    may_delete = (
        permission.permission_service.check_service_account_token_permission(
            caller.id, token_record['creator_user_hash'], 'delete'))
    if not may_delete:
        raise fastapi.HTTPException(
            status_code=403,
            detail='You can only delete your own tokens. Only admins can '
            'delete tokens owned by other users.')

    # The service account user is removed first so that the request fails
    # before touching the token if the account still owns active resources.
    _delete_user(token_record['service_account_user_id'])

    # Delete the token
    if not global_user_state.delete_service_account_token(token_id):
        raise fastapi.HTTPException(status_code=404, detail='Token not found')

    return {'message': 'Token deleted successfully'}
558
+
559
+
560
@router.post('/service-account-tokens/get-role')
def get_service_account_role(
        request: fastapi.Request,
        role_body: payloads.ServiceAccountTokenRoleBody) -> Dict[str, Any]:
    """Return the roles of the service account behind a token.

    Whether the caller may view the roles is decided by the permission
    service's 'view' check against the token's creator.

    Raises:
        fastapi.HTTPException: 401 without an authenticated user, 404 when
            the token is unknown, 403 when the permission check fails.
    """
    caller = request.state.auth_user
    if caller is None:
        raise fastapi.HTTPException(status_code=401,
                                    detail='Authentication required')

    token_id = role_body.token_id

    # The token record tells us which service account user it belongs to.
    token_record = global_user_state.get_service_account_token(token_id)
    if token_record is None:
        raise fastapi.HTTPException(status_code=404, detail='Token not found')

    # Check permissions - only creator or admin can view roles
    may_view = (
        permission.permission_service.check_service_account_token_permission(
            caller.id, token_record['creator_user_hash'], 'view'))
    if not may_view:
        raise fastapi.HTTPException(
            status_code=403,
            detail='You can only view roles for your own service accounts. '
            'Only admins can view roles for service accounts owned by other '
            'users.')

    sa_user_id = token_record['service_account_user_id']
    sa_roles = permission.permission_service.get_user_roles(sa_user_id)

    return {
        'token_id': token_id,
        'service_account_user_id': sa_user_id,
        'roles': sa_roles
    }
594
+
595
+
596
@router.post('/service-account-tokens/update-role')
def update_service_account_role(
    request: fastapi.Request,
    role_body: payloads.ServiceAccountTokenUpdateRoleBody
) -> Dict[str, str]:
    """Update the role of the service account behind a token.

    Whether the caller may update the role is decided by the permission
    service's 'update' check against the token's creator.

    Returns:
        A dict with a confirmation message, the token ID, the service
        account user ID and the new role.

    Raises:
        fastapi.HTTPException: 401 without an authenticated user, 404 when
            the token is unknown, 403 when the permission check fails, 400
            when the role is not a supported role, 500 when the update
            itself fails.
    """
    auth_user = request.state.auth_user
    if auth_user is None:
        raise fastapi.HTTPException(status_code=401,
                                    detail='Authentication required')

    # Get token info to find the service account user ID
    token_info = global_user_state.get_service_account_token(role_body.token_id)
    if token_info is None:
        raise fastapi.HTTPException(status_code=404, detail='Token not found')

    # Check permissions - only creator or admin can update roles
    if not permission.permission_service.check_service_account_token_permission(
            auth_user.id, token_info['creator_user_hash'], 'update'):
        raise fastapi.HTTPException(
            status_code=403,
            detail='You can only update roles for your own service accounts. '
            'Only admins can update roles for service accounts owned by other '
            'users.')

    # Reject unsupported roles up front, mirroring the validation that
    # user_update performs, instead of handing them to the permission service.
    if role_body.role not in rbac.get_supported_roles():
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Invalid role: {role_body.role}')

    try:
        # Update service account role
        service_account_user_id = token_info['service_account_user_id']
        permission.permission_service.update_role(service_account_user_id,
                                                  role_body.role)

        return {
            'message': f'Service account role updated to {role_body.role}',
            'token_id': role_body.token_id,
            'service_account_user_id': service_account_user_id,
            'new_role': role_body.role
        }
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'Failed to update service account role: {e}')
        raise fastapi.HTTPException(
            status_code=500,
            detail='Failed to update service account role') from e
637
+
638
+
639
@router.post('/service-account-tokens/rotate')
def rotate_service_account_token(
        request: fastapi.Request,
        token_body: payloads.ServiceAccountTokenRotateBody) -> Dict[str, Any]:
    """Issue a new token value for an existing service account token.

    The token keeps its ID, name, creator and service account user; only the
    token value, its stored hash and its expiration are replaced. The caller
    needs the same 'delete' permission that deleting the token requires.

    Expiration handling: ``0`` means never expire; an explicit positive
    value is used as given; when omitted, the remaining lifetime of the old
    token is carried over in whole days (at least 1), an already expired
    token gets 30 days, and a token without expiration stays without one.

    Raises:
        fastapi.HTTPException: 401 without an authenticated user, 404 when
            the token is unknown, 403 when the permission check fails, 400
            for a negative expiration, 500 when the rotation fails.
    """
    caller = request.state.auth_user
    if caller is None:
        raise fastapi.HTTPException(status_code=401,
                                    detail='Authentication required')

    token_id = token_body.token_id

    # Get token info
    token_record = global_user_state.get_service_account_token(token_id)
    if token_record is None:
        raise fastapi.HTTPException(status_code=404, detail='Token not found')

    # Check permissions - same as delete permission (only creator or admin)
    may_rotate = (
        permission.permission_service.check_service_account_token_permission(
            caller.id, token_record['creator_user_hash'], 'delete'))
    if not may_rotate:
        raise fastapi.HTTPException(
            status_code=403,
            detail='You can only rotate your own tokens. Only admins can '
            'rotate tokens owned by other users.')

    # Validate expiration if provided (allow 0 as special value for "never
    # expire")
    requested_days = token_body.expires_in_days
    if requested_days is not None and requested_days < 0:
        raise fastapi.HTTPException(
            status_code=400,
            detail='Expiration days must be positive or 0 for never expire')

    try:
        if requested_days is None:
            # No expiration specified, try to preserve original expiration
            previous_expiry = token_record['expires_at']
            if previous_expiry:
                seconds_left = previous_expiry - time.time()
                if seconds_left > 0:
                    lifetime_days = max(1, int(seconds_left / (24 * 3600)))
                else:
                    # Token already expired, default to 30 days
                    lifetime_days = 30
            else:
                lifetime_days = None
        elif requested_days == 0:
            # Special value 0 means "never expire"
            lifetime_days = None
        else:
            lifetime_days = requested_days

        sa_user_id = token_record['service_account_user_id']
        token_name = token_record['token_name']

        # Generate new JWT token with same service account user ID
        fresh_token = token_service.token_service.create_token(
            creator_user_id=token_record['creator_user_hash'],
            service_account_user_id=sa_user_id,
            token_name=token_name,
            expires_in_days=lifetime_days)

        # Update token in database with new token hash
        global_user_state.rotate_service_account_token(
            token_id=token_id,
            new_token_hash=fresh_token['token_hash'],
            new_expires_at=fresh_token['expires_at'])

        # Return the new JWT token only once (never stored in plain text)
        return {
            'token_id': token_id,
            'token_name': token_name,
            'token': fresh_token['token'],  # Full JWT token with sky_ prefix
            'expires_at': fresh_token['expires_at'],
            'service_account_user_id': sa_user_id,
            'message': ('Token rotated successfully! Please save this new '
                        'token - it will not be shown again!')
        }

    except Exception as rotate_error:  # pylint: disable=broad-except
        logger.error(
            f'Failed to rotate service account token: {rotate_error}')
        raise fastapi.HTTPException(
            status_code=500, detail='Failed to rotate service account token')