skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,218 @@
1
+ """JWT-based service account token management for SkyPilot."""
2
+
3
+ import contextlib
4
+ import datetime
5
+ import hashlib
6
+ import os
7
+ import secrets
8
+ import threading
9
+ from typing import Any, Dict, Generator, Optional
10
+
11
+ import filelock
12
+ import jwt
13
+
14
+ from sky import global_user_state
15
+ from sky import sky_logging
16
+
17
+ logger = sky_logging.init_logger(__name__)
18
+
19
+ # JWT Configuration
20
+ JWT_ALGORITHM = 'HS256'
21
+ JWT_ISSUER = 'sky' # Shortened for compact tokens
22
+ JWT_SECRET_DB_KEY = 'jwt_secret'
23
+
24
+ # File lock for JWT secret initialization
25
+ JWT_SECRET_LOCK_PATH = os.path.expanduser('~/.sky/.jwt_secret_init.lock')
26
+ JWT_SECRET_LOCK_TIMEOUT_SECONDS = 20
27
+
28
+
29
@contextlib.contextmanager
def _jwt_secret_lock() -> Generator[None, None, None]:
    """Hold the cross-process file lock used while initializing the secret.

    The lock file lives at ``JWT_SECRET_LOCK_PATH`` and acquisition gives up
    after ``JWT_SECRET_LOCK_TIMEOUT_SECONDS`` seconds.

    Yields:
        None, while the lock is held.

    Raises:
        RuntimeError: If the lock cannot be acquired before the timeout.
    """
    lock = filelock.FileLock(JWT_SECRET_LOCK_PATH,
                             timeout=JWT_SECRET_LOCK_TIMEOUT_SECONDS)
    # Only the acquisition is guarded: a filelock.Timeout raised by the
    # caller's own code inside the ``with`` body must propagate untouched
    # instead of being reported as a failure to acquire *this* lock.
    try:
        lock.acquire()
    except filelock.Timeout as e:
        raise RuntimeError('Failed to initialize JWT secret due to a timeout '
                           'when trying to acquire the lock at '
                           f'{JWT_SECRET_LOCK_PATH}. '
                           'Please try again or manually remove the lock '
                           'file if you believe it is stale.') from e
    try:
        yield
    finally:
        # Always release, even if the body raised.
        lock.release()
42
+
43
+
44
class TokenService:
    """Service for managing JWT-based service account tokens.

    Tokens are HS256-signed JWTs prefixed with ``sky_``. To keep tokens
    compact, the payload uses single-character field names instead of the
    registered JWT claim names:

        'i': issuer, 't': issued-at, 'u': service account user ID,
        'k': token ID, 'y': token type, 'e': expiration (optional).

    Because these are not registered claims, the JWT library does not
    validate them; ``verify_token`` checks issuer, type and expiration
    explicitly.
    """

    def __init__(self):
        # Signing secret; populated on first use by _lazy_initialize().
        self.secret_key: Optional[str] = None
        # Serializes the one-time initialization of ``secret_key`` between
        # threads of this process.
        self.init_lock = threading.Lock()

    def _lazy_initialize(self) -> None:
        """Load (or create) the signing secret the first time it is needed."""
        if self.secret_key is not None:
            return
        with self.init_lock:
            # Re-check under the lock: another thread may have completed the
            # initialization while this one was waiting.
            if self.secret_key is not None:
                return
            self.secret_key = self._get_or_generate_secret()

    def _get_or_generate_secret(self) -> str:
        """Get JWT secret from database or generate a new one.

        Returns:
            The secret stored under ``JWT_SECRET_DB_KEY`` if one exists,
            otherwise a freshly generated secret. If the new secret cannot
            be stored, it is still returned and used in memory only.
        """

        def _get_secret_from_db() -> Optional[str]:
            # Best-effort read: any failure is logged and treated as
            # "no secret stored".
            try:
                db_secret = global_user_state.get_system_config(
                    JWT_SECRET_DB_KEY)
                if db_secret:
                    logger.debug('Retrieved existing JWT secret from database')
                    return db_secret
            except Exception as e:  # pylint: disable=broad-except
                logger.debug(f'Failed to get JWT secret from database: {e}')
            return None

        # Try to get from database (persistent across deployments)
        secret_from_db = _get_secret_from_db()
        if secret_from_db:
            return secret_from_db

        with _jwt_secret_lock():
            # Read again while holding the file lock: another process may
            # have generated and stored the secret in the meantime.
            secret_from_db = _get_secret_from_db()
            if secret_from_db:
                return secret_from_db
            # Generate a new secret and store in database
            new_secret = secrets.token_urlsafe(64)
            try:
                global_user_state.set_system_config(JWT_SECRET_DB_KEY,
                                                    new_secret)
                logger.info(
                    'Generated new JWT secret and stored in database. '
                    'This secret will persist across API server restarts.')
            except Exception as e:  # pylint: disable=broad-except
                logger.warning(
                    f'Failed to store new JWT secret in database: {e}. '
                    f'Using in-memory secret (tokens will not persist '
                    f'across restarts).')

            return new_secret

    def create_token(self,
                     creator_user_id: str,
                     service_account_user_id: str,
                     token_name: str,
                     expires_in_days: Optional[int] = None) -> Dict[str, Any]:
        """Create a new JWT service account token.

        Args:
            creator_user_id: The creator's user hash
            service_account_user_id: The service account's own user ID
            token_name: Descriptive name for the token
            expires_in_days: Optional expiration in days. A falsy value
                (``None`` or ``0``) creates a token without expiration.

        Returns:
            Dict containing token info including the JWT token: keys
            ``token_id``, ``token``, ``token_hash``, ``creator_user_id``,
            ``service_account_user_id``, ``token_name``, ``created_at`` and
            ``expires_at`` (``None`` when the token does not expire).
        """
        self._lazy_initialize()
        now = datetime.datetime.now(datetime.timezone.utc)
        created_at = int(now.timestamp())
        token_id = secrets.token_urlsafe(12)  # Shorter ID for JWT

        # Build minimal JWT payload with single-character field names for
        # compactness
        payload = {
            'i': JWT_ISSUER,  # Issuer (use constant)
            't': created_at,  # Issued at (shortened from 'iat')
            # Service account user ID (shortened from 'sub')
            'u': service_account_user_id,
            'k': token_id,  # Token ID (shortened from 'token_id')
            'y': 'sa',  # Type: service account (shortened from 'type')
        }

        # Add expiration if specified
        expires_at = None
        if expires_in_days:
            exp_time = now + datetime.timedelta(days=expires_in_days)
            expires_at = int(exp_time.timestamp())
            # Expiration (shortened from 'exp'). The JWT library does not
            # enforce this custom field; verify_token() checks it.
            payload['e'] = expires_at

        # Generate JWT
        jwt_token = jwt.encode(payload,
                               self.secret_key,
                               algorithm=JWT_ALGORITHM)

        # Create token with SkyPilot prefix
        full_token = f'sky_{jwt_token}'

        # Generate hash for database storage (we still hash the full token)
        token_hash = hashlib.sha256(full_token.encode()).hexdigest()

        return {
            'token_id': token_id,
            'token': full_token,
            'token_hash': token_hash,
            'creator_user_id': creator_user_id,
            'service_account_user_id': service_account_user_id,
            'token_name': token_name,
            'created_at': created_at,
            'expires_at': expires_at,
        }

    def verify_token(self, token: str) -> Optional[Dict[str, Any]]:
        """Verify and decode a JWT token.

        Args:
            token: The full token (with sky_ prefix)

        Returns:
            Decoded token payload with standard field names (``iss``,
            ``iat``, ``sub``, ``token_id``, ``type`` and, if set, ``exp``),
            or None if the token is malformed, has a bad signature, a wrong
            issuer or type, or has expired.
        """
        self._lazy_initialize()
        if not token.startswith('sky_'):
            return None

        # Remove the sky_ prefix
        jwt_token = token[4:]

        try:
            # Decode and verify JWT (without issuer verification)
            payload = jwt.decode(jwt_token,
                                 self.secret_key,
                                 algorithms=[JWT_ALGORITHM])

            # Manually verify issuer using our shortened field name
            token_issuer = payload.get('i')
            if token_issuer != JWT_ISSUER:
                logger.warning(f'Invalid token issuer: {token_issuer}')
                return None

            # Verify token type
            token_type = payload.get('y')
            if token_type != 'sa':
                logger.warning(f'Invalid token type: {token_type}')
                return None

            # Manually verify expiration. The expiry is stored under the
            # shortened field 'e', which the JWT library does not validate
            # (it only checks the registered 'exp' claim), so without this
            # check an expired token would still be accepted here.
            if 'e' in payload:
                try:
                    expires_at = int(payload['e'])
                except (TypeError, ValueError):
                    logger.warning('Invalid token: malformed expiration')
                    return None
                now_ts = datetime.datetime.now(
                    datetime.timezone.utc).timestamp()
                if now_ts >= expires_at:
                    logger.warning('Token has expired')
                    return None

            # Convert shortened field names back to standard names for
            # compatibility
            normalized_payload = {
                'iss': payload.get('i'),  # issuer
                'iat': payload.get('t'),  # issued at
                'sub': payload.get('u'),  # subject (service account user ID)
                'token_id': payload.get('k'),  # token ID
                'type': 'service_account',  # expand shortened type
            }

            # Add expiration if present
            if 'e' in payload:
                normalized_payload['exp'] = payload['e']

            return normalized_payload

        except jwt.ExpiredSignatureError:
            # Only reachable for tokens that carry a registered 'exp' claim.
            logger.warning('Token has expired')
            return None
        except jwt.InvalidTokenError as e:
            logger.warning(f'Invalid token: {e}')
            return None
215
+
216
+
217
+ # Singleton instance
218
+ token_service = TokenService()
@@ -1,8 +1,8 @@
1
1
  """Accelerator registry."""
2
2
  import typing
3
- from typing import Optional
3
+ from typing import List, Optional
4
4
 
5
- from sky.clouds import service_catalog
5
+ from sky import catalog
6
6
  from sky.utils import rich_utils
7
7
  from sky.utils import ux_utils
8
8
 
@@ -34,7 +34,8 @@ if typing.TYPE_CHECKING:
34
34
 
35
35
  # Use a cached version of accelerators to cloud mapping, so that we don't have
36
36
  # to download and read the catalog file for every cloud locally.
37
- _accelerator_df = service_catalog.common.read_catalog('common/accelerators.csv')
37
+ _accelerator_df = catalog.common.read_catalog('common/accelerators.csv')
38
+ _memory_df = catalog.common.read_catalog('common/metadata.csv')
38
39
 
39
40
  # List of non-GPU accelerators that are supported by our backend for job queue
40
41
  # scheduling.
@@ -45,6 +46,32 @@ _SCHEDULABLE_NON_GPU_ACCELERATORS = [
45
46
  ]
46
47
 
47
48
 
49
+ def get_devices_by_memory(memory: float,
50
+ plus: bool = False,
51
+ manufacturer: Optional[str] = None) -> List[str]:
52
+ """Returns a list of device names that meet the memory and manufacturer
53
+ requirements.
54
+
55
+ Args:
56
+ memory: The memory size in GB (a minimum only if `plus` is True).
57
+ plus: If True, returns devices with memory >= memory, otherwise returns
58
+ devices with memory == memory.
59
+ manufacturer: The manufacturer of the GPU.
60
+ """
61
+
62
+ # Filter by memory requirements
63
+ if plus:
64
+ df = _memory_df[_memory_df['MemoryGB'] >= memory]
65
+ else:
66
+ df = _memory_df[_memory_df['MemoryGB'] == memory]
67
+
68
+ # Filter by manufacturer if specified
69
+ if manufacturer is not None:
70
+ df = df[df['Manufacturer'].str.lower() == manufacturer.lower()]
71
+
72
+ return df['GPU'].tolist()
73
+
74
+
48
75
  def is_schedulable_non_gpu_accelerator(accelerator_name: str) -> bool:
49
76
  """Returns if this accelerator is a 'schedulable' non-GPU accelerator."""
50
77
  for name in _SCHEDULABLE_NON_GPU_ACCELERATORS:
@@ -80,10 +107,12 @@ def canonicalize_accelerator_name(accelerator: str,
80
107
  if not names and cloud_str in ['Kubernetes', None]:
81
108
  with rich_utils.safe_status(
82
109
  ux_utils.spinner_message('Listing accelerators on Kubernetes')):
83
- searched = service_catalog.list_accelerators(
110
+ # Only search for Kubernetes to reduce the lookup cost.
111
+ # For other clouds, the catalog has been searched in previous steps.
112
+ searched = catalog.list_accelerators(
84
113
  name_filter=accelerator,
85
114
  case_sensitive=False,
86
- clouds=cloud_str,
115
+ clouds='Kubernetes',
87
116
  )
88
117
  names = list(searched.keys())
89
118
  if accelerator in names:
@@ -1,9 +1,10 @@
1
1
  """Admin policy utils."""
2
+ import contextlib
2
3
  import copy
3
4
  import importlib
4
- import os
5
- import tempfile
6
- from typing import Optional, Tuple, Union
5
+ import typing
6
+ from typing import Iterator, Optional, Tuple, Union
7
+ import urllib.parse
7
8
 
8
9
  import colorama
9
10
 
@@ -13,25 +14,45 @@ from sky import exceptions
13
14
  from sky import sky_logging
14
15
  from sky import skypilot_config
15
16
  from sky import task as task_lib
17
+ from sky.server.requests import request_names
16
18
  from sky.utils import common_utils
17
19
  from sky.utils import config_utils
18
20
  from sky.utils import ux_utils
19
21
 
20
22
  logger = sky_logging.init_logger(__name__)
21
23
 
24
+ if typing.TYPE_CHECKING:
25
+ from sky import models
22
26
 
23
- def _get_policy_cls(
24
- policy: Optional[str]) -> Optional[admin_policy.AdminPolicy]:
27
+
28
+ def _is_url(policy_string: str) -> bool:
29
+ """Check if the policy string is a URL."""
30
+ try:
31
+ parsed = urllib.parse.urlparse(policy_string)
32
+ return parsed.scheme in ('http', 'https')
33
+ except Exception: # pylint: disable=broad-except
34
+ return False
35
+
36
+
37
+ def _get_policy_impl(
38
+ policy_location: Optional[str]
39
+ ) -> Optional[admin_policy.PolicyInterface]:
25
40
  """Gets admin-defined policy."""
26
- if policy is None:
41
+ if policy_location is None:
27
42
  return None
43
+
44
+ if _is_url(policy_location):
45
+ # Use the built-in URL policy class when a URL is specified.
46
+ return admin_policy.RestfulAdminPolicy(policy_location)
47
+
48
+ # Handle module path format
28
49
  try:
29
- module_path, class_name = policy.rsplit('.', 1)
50
+ module_path, class_name = policy_location.rsplit('.', 1)
30
51
  module = importlib.import_module(module_path)
31
52
  except ImportError as e:
32
53
  with ux_utils.print_exception_no_traceback():
33
54
  raise ImportError(
34
- f'Failed to import policy module: {policy}. '
55
+ f'Failed to import policy module: {policy_location}. '
35
56
  'Please check if the module is installed in your Python '
36
57
  'environment.') from e
37
58
 
@@ -43,19 +64,48 @@ def _get_policy_cls(
43
64
  f'Could not find {class_name} class in module {module_path}. '
44
65
  'Please check with your policy admin for details.') from e
45
66
 
46
- # Check if the module implements the AdminPolicy interface.
67
+ # Currently we only allow users to define subclass of AdminPolicy
68
+ # instead of inheriting from PolicyInterface or PolicyTemplate.
47
69
  if not issubclass(policy_cls, admin_policy.AdminPolicy):
48
70
  with ux_utils.print_exception_no_traceback():
49
71
  raise ValueError(
50
- f'Policy class {policy!r} does not implement the AdminPolicy '
51
- 'interface. Please check with your policy admin for details.')
52
- return policy_cls
72
+ f'Policy class {policy_cls!r} does not implement the '
73
+ 'AdminPolicy interface. Please check with your policy admin '
74
+ 'for details.')
75
+ return policy_cls()
76
+
77
+
78
+ @contextlib.contextmanager
79
+ def apply_and_use_config_in_current_request(
80
+ entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
81
+ request_name: request_names.AdminPolicyRequestName,
82
+ request_options: Optional[admin_policy.RequestOptions] = None,
83
+ at_client_side: bool = False,
84
+ ) -> Iterator['dag_lib.Dag']:
85
+ """Applies an admin policy and overrides SkyPilot config for this request
86
+
87
+ This is a helper function of `apply()` that applies an admin policy and
88
+ overrides the SkyPilot config for the current request as a context manager.
89
+ The original SkyPilot config will be restored when the context manager is
90
+ exited.
91
+
92
+ Refer to `apply()` for more details.
93
+ """
94
+ original_config = skypilot_config.to_dict()
95
+ dag, mutated_config = apply(entrypoint, request_name, request_options,
96
+ at_client_side)
97
+ if mutated_config != original_config:
98
+ with skypilot_config.replace_skypilot_config(mutated_config):
99
+ yield dag
100
+ else:
101
+ yield dag
53
102
 
54
103
 
55
104
  def apply(
56
105
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
57
- use_mutated_config_in_current_request: bool = True,
106
+ request_name: request_names.AdminPolicyRequestName,
58
107
  request_options: Optional[admin_policy.RequestOptions] = None,
108
+ at_client_side: bool = False,
59
109
  ) -> Tuple['dag_lib.Dag', config_utils.Config]:
60
110
  """Applies an admin policy (if registered) to a DAG or a task.
61
111
 
@@ -79,29 +129,41 @@ def apply(
79
129
  else:
80
130
  dag = entrypoint
81
131
 
82
- policy = skypilot_config.get_nested(('admin_policy',), None)
83
- policy_cls = _get_policy_cls(policy)
84
- if policy_cls is None:
132
+ policy_location = skypilot_config.get_nested(('admin_policy',), None)
133
+ policy = _get_policy_impl(policy_location)
134
+ if policy is None:
85
135
  return dag, skypilot_config.to_dict()
86
136
 
87
- logger.info(f'Applying policy: {policy}')
88
- original_config = skypilot_config.to_dict()
89
- config = copy.deepcopy(original_config)
137
+ user = None
138
+ if at_client_side:
139
+ logger.info(f'Applying client admin policy: {policy}')
140
+ else:
141
+ # When being called by the server, the middleware has set the
142
+ # current user and this information is available at this point.
143
+ user = common_utils.get_current_user()
144
+ logger.info(f'Applying server admin policy: {policy}')
145
+ config = copy.deepcopy(skypilot_config.to_dict())
90
146
  mutated_dag = dag_lib.Dag()
91
147
  mutated_dag.name = dag.name
92
148
 
93
149
  mutated_config = None
94
150
  for task in dag.tasks:
95
- user_request = admin_policy.UserRequest(task, config, request_options)
151
+ user_request = admin_policy.UserRequest(task, config, request_name,
152
+ request_options, at_client_side,
153
+ user)
96
154
  try:
97
- mutated_user_request = policy_cls.validate_and_mutate(user_request)
155
+ mutated_user_request = policy.apply(user_request)
156
+ # Avoid duplicate exception wrapping.
157
+ except exceptions.UserRequestRejectedByPolicy as e:
158
+ with ux_utils.print_exception_no_traceback():
159
+ raise e
98
160
  except Exception as e: # pylint: disable=broad-except
99
161
  with ux_utils.print_exception_no_traceback():
100
162
  raise exceptions.UserRequestRejectedByPolicy(
101
163
  f'{colorama.Fore.RED}User request rejected by policy '
102
164
  f'{policy!r}{colorama.Fore.RESET}: '
103
165
  f'{common_utils.format_exception(e, use_bracket=True)}'
104
- ) from e
166
+ ) from None
105
167
  if mutated_config is None:
106
168
  mutated_config = mutated_user_request.skypilot_config
107
169
  else:
@@ -126,22 +188,6 @@ def apply(
126
188
  mutated_dag.graph.add_edge(mutated_dag.tasks[u_idx],
127
189
  mutated_dag.tasks[v_idx])
128
190
 
129
- if (use_mutated_config_in_current_request and
130
- original_config != mutated_config):
131
- with tempfile.NamedTemporaryFile(
132
- delete=False,
133
- mode='w',
134
- prefix='policy-mutated-skypilot-config-',
135
- suffix='.yaml') as temp_file:
136
-
137
- common_utils.dump_yaml(temp_file.name, dict(**mutated_config))
138
- os.environ[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = temp_file.name
139
- logger.debug(f'Updated SkyPilot config: {temp_file.name}')
140
- # TODO(zhwu): This is not a clean way to update the SkyPilot config,
141
- # because we are resetting the global context for a single DAG,
142
- # which is conceptually weird.
143
- importlib.reload(skypilot_config)
144
-
145
191
  logger.debug(f'Mutated user request: {mutated_user_request}')
146
192
  mutated_dag.policy_applied = True
147
193
  return mutated_dag, mutated_config
sky/utils/annotations.py CHANGED
@@ -1,14 +1,19 @@
1
1
  """Annotations for public APIs."""
2
2
 
3
3
  import functools
4
- from typing import Callable, Literal
4
+ from typing import Callable, Literal, TypeVar
5
+
6
+ from typing_extensions import ParamSpec
5
7
 
6
8
  # Whether the current process is a SkyPilot API server process.
7
9
  is_on_api_server = True
8
- FUNCTIONS_NEED_RELOAD_CACHE = []
10
+ _FUNCTIONS_NEED_RELOAD_CACHE = []
11
+
12
+ T = TypeVar('T')
13
+ P = ParamSpec('P')
9
14
 
10
15
 
11
- def client_api(func):
16
+ def client_api(func: Callable[P, T]) -> Callable[P, T]:
12
17
  """Mark a function as a client-side API.
13
18
 
14
19
  Code invoked by server-side functions will find annotations.is_on_api_server
@@ -38,14 +43,20 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
38
43
  lru_cache_kwargs: Keyword arguments for functools.lru_cache.
39
44
  """
40
45
 
41
- def decorator(func: Callable) -> Callable:
46
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
42
47
  if scope == 'global':
43
48
  return functools.lru_cache(*lru_cache_args,
44
49
  **lru_cache_kwargs)(func)
45
50
  else:
46
51
  cached_func = functools.lru_cache(*lru_cache_args,
47
52
  **lru_cache_kwargs)(func)
48
- FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
53
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
49
54
  return cached_func
50
55
 
51
56
  return decorator
57
+
58
+
59
+ def clear_request_level_cache():
60
+ """Clear the request-level cache."""
61
+ for func in _FUNCTIONS_NEED_RELOAD_CACHE:
62
+ func.cache_clear()
@@ -0,0 +1,78 @@
1
+ """Asyncio utilities."""
2
+
3
+ import asyncio
4
+ import functools
5
+ from typing import Set
6
+
7
+ _background_tasks: Set[asyncio.Task] = set()
8
+
9
+
10
+ def shield(func):
11
+ """Shield the decorated async function from cancellation.
12
+
13
+ If the outer coroutine is cancelled, the inner decorated function
14
+ will be protected from cancellation by asyncio.shield(). And we will
15
+ maintain a reference to the inner task to avoid it being GCed before
16
+ it is done.
17
+
18
+ For example, filelock.AsyncFileLock is not cancellation safe. The
19
+ following code:
20
+
21
+ async def fn_with_lock():
22
+ async with filelock.AsyncFileLock('lock'):
23
+ await asyncio.sleep(1)
24
+
25
+ is equivalent to:
26
+
27
+ # The lock may leak if the cancellation happens in
28
+ # lock.acquire() or lock.release()
29
+ async def fn_with_lock():
30
+ lock = filelock.AsyncFileLock('lock')
31
+ await lock.acquire()
32
+ try:
33
+ await asyncio.sleep(1)
34
+ finally:
35
+ await lock.release()
36
+
37
+ Shielding the function ensures that no cancellation will happen in the
38
+ function, thus the lock will be released properly:
39
+
40
+ @shield
41
+ async def fn_with_lock()
42
+
43
+ Note that the resource acquisition and release should usually be protected
44
+ in one @shield block but not separately, e.g.:
45
+
46
+ lock = filelock.AsyncFileLock('lock')
47
+
48
+ @shield
49
+ async def acquire():
50
+ await lock.acquire()
51
+
52
+ @shield
53
+ async def release():
54
+ await lock.release()
55
+
56
+ async def fn_with_lock():
57
+ await acquire()
58
+ try:
59
+ do_something()
60
+ finally:
61
+ await release()
62
+
63
+ The above code is not safe because if `fn_with_lock` is cancelled,
64
+ `acquire()` and `release()` will be executed in the background
65
+ concurrently and cause race conditions.
66
+ """
67
+
68
+ @functools.wraps(func)
69
+ async def async_wrapper(*args, **kwargs):
70
+ task = asyncio.create_task(func(*args, **kwargs))
71
+ try:
72
+ return await asyncio.shield(task)
73
+ except asyncio.CancelledError:
74
+ _background_tasks.add(task)
75
+ task.add_done_callback(lambda _: _background_tasks.discard(task))
76
+ raise
77
+
78
+ return async_wrapper