skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,218 @@
1
+ """JWT-based service account token management for SkyPilot."""
2
+
3
+ import contextlib
4
+ import datetime
5
+ import hashlib
6
+ import os
7
+ import secrets
8
+ import threading
9
+ from typing import Any, Dict, Generator, Optional
10
+
11
+ import filelock
12
+ import jwt
13
+
14
+ from sky import global_user_state
15
+ from sky import sky_logging
16
+
17
+ logger = sky_logging.init_logger(__name__)
18
+
19
+ # JWT Configuration
20
+ JWT_ALGORITHM = 'HS256'
21
+ JWT_ISSUER = 'sky' # Shortened for compact tokens
22
+ JWT_SECRET_DB_KEY = 'jwt_secret'
23
+
24
+ # File lock for JWT secret initialization
25
+ JWT_SECRET_LOCK_PATH = os.path.expanduser('~/.sky/.jwt_secret_init.lock')
26
+ JWT_SECRET_LOCK_TIMEOUT_SECONDS = 20
27
+
28
+
29
+ @contextlib.contextmanager
30
+ def _jwt_secret_lock() -> Generator[None, None, None]:
31
+ """Context manager for JWT secret initialization lock."""
32
+ try:
33
+ with filelock.FileLock(JWT_SECRET_LOCK_PATH,
34
+ JWT_SECRET_LOCK_TIMEOUT_SECONDS):
35
+ yield
36
+ except filelock.Timeout as e:
37
+ raise RuntimeError(f'Failed to initialize JWT secret due to a timeout '
38
+ f'when trying to acquire the lock at '
39
+ f'{JWT_SECRET_LOCK_PATH}. '
40
+ 'Please try again or manually remove the lock '
41
+ f'file if you believe it is stale.') from e
42
+
43
+
44
+ class TokenService:
45
+ """Service for managing JWT-based service account tokens."""
46
+
47
+ def __init__(self):
48
+ self.secret_key = None
49
+ self.init_lock = threading.Lock()
50
+
51
+ def _lazy_initialize(self):
52
+ if self.secret_key is not None:
53
+ return
54
+ with self.init_lock:
55
+ if self.secret_key is not None:
56
+ return
57
+ self.secret_key = self._get_or_generate_secret()
58
+
59
+ def _get_or_generate_secret(self) -> str:
60
+ """Get JWT secret from database or generate a new one."""
61
+
62
+ def _get_secret_from_db():
63
+ try:
64
+ db_secret = global_user_state.get_system_config(
65
+ JWT_SECRET_DB_KEY)
66
+ if db_secret:
67
+ logger.debug('Retrieved existing JWT secret from database')
68
+ return db_secret
69
+ except Exception as e: # pylint: disable=broad-except
70
+ logger.debug(f'Failed to get JWT secret from database: {e}')
71
+ return None
72
+
73
+ # Try to get from database (persistent across deployments)
74
+ token_from_db = _get_secret_from_db()
75
+ if token_from_db:
76
+ return token_from_db
77
+
78
+ with _jwt_secret_lock():
79
+ token_from_db = _get_secret_from_db()
80
+ if token_from_db:
81
+ return token_from_db
82
+ # Generate a new secret and store in database
83
+ new_secret = secrets.token_urlsafe(64)
84
+ try:
85
+ global_user_state.set_system_config(JWT_SECRET_DB_KEY,
86
+ new_secret)
87
+ logger.info(
88
+ 'Generated new JWT secret and stored in database. '
89
+ 'This secret will persist across API server restarts.')
90
+ except Exception as e: # pylint: disable=broad-except
91
+ logger.warning(
92
+ f'Failed to store new JWT secret in database: {e}. '
93
+ f'Using in-memory secret (tokens will not persist '
94
+ f'across restarts).')
95
+
96
+ return new_secret
97
+
98
+ def create_token(self,
99
+ creator_user_id: str,
100
+ service_account_user_id: str,
101
+ token_name: str,
102
+ expires_in_days: Optional[int] = None) -> Dict[str, Any]:
103
+ """Create a new JWT service account token.
104
+
105
+ Args:
106
+ creator_user_id: The creator's user hash
107
+ service_account_user_id: The service account's own user ID
108
+ token_name: Descriptive name for the token
109
+ expires_in_days: Optional expiration in days
110
+
111
+ Returns:
112
+ Dict containing token info including the JWT token
113
+ """
114
+ self._lazy_initialize()
115
+ now = datetime.datetime.now(datetime.timezone.utc)
116
+ token_id = secrets.token_urlsafe(12) # Shorter ID for JWT
117
+
118
+ # Build minimal JWT payload with single-character field names for
119
+ # compactness
120
+ payload = {
121
+ 'i': JWT_ISSUER, # Issuer (use constant)
122
+ 't': int(now.timestamp()), # Issued at (shortened from 'iat')
123
+ # Service account user ID (shortened from 'sub')
124
+ 'u': service_account_user_id,
125
+ 'k': token_id, # Token ID (shortened from 'token_id')
126
+ 'y': 'sa', # Type: service account (shortened from 'type')
127
+ }
128
+
129
+ # Add expiration if specified
130
+ expires_at = None
131
+ if expires_in_days:
132
+ exp_time = now + datetime.timedelta(days=expires_in_days)
133
+ payload['e'] = int(
134
+ exp_time.timestamp()) # Expiration (shortened from 'exp')
135
+ expires_at = int(exp_time.timestamp())
136
+
137
+ # Generate JWT
138
+ jwt_token = jwt.encode(payload,
139
+ self.secret_key,
140
+ algorithm=JWT_ALGORITHM)
141
+
142
+ # Create token with SkyPilot prefix
143
+ full_token = f'sky_{jwt_token}'
144
+
145
+ # Generate hash for database storage (we still hash the full token)
146
+ token_hash = hashlib.sha256(full_token.encode()).hexdigest()
147
+
148
+ return {
149
+ 'token_id': token_id,
150
+ 'token': full_token,
151
+ 'token_hash': token_hash,
152
+ 'creator_user_id': creator_user_id,
153
+ 'service_account_user_id': service_account_user_id,
154
+ 'token_name': token_name,
155
+ 'created_at': int(now.timestamp()),
156
+ 'expires_at': expires_at,
157
+ }
158
+
159
+ def verify_token(self, token: str) -> Optional[Dict[str, Any]]:
160
+ """Verify and decode a JWT token.
161
+
162
+ Args:
163
+ token: The full token (with sky_ prefix)
164
+
165
+ Returns:
166
+ Decoded token payload or None if invalid
167
+ """
168
+ self._lazy_initialize()
169
+ if not token.startswith('sky_'):
170
+ return None
171
+
172
+ # Remove the sky_ prefix
173
+ jwt_token = token[4:]
174
+
175
+ try:
176
+ # Decode and verify JWT (without issuer verification)
177
+ payload = jwt.decode(jwt_token,
178
+ self.secret_key,
179
+ algorithms=[JWT_ALGORITHM])
180
+
181
+ # Manually verify issuer using our shortened field name
182
+ token_issuer = payload.get('i')
183
+ if token_issuer != JWT_ISSUER:
184
+ logger.warning(f'Invalid token issuer: {token_issuer}')
185
+ return None
186
+
187
+ # Verify token type
188
+ token_type = payload.get('y')
189
+ if token_type != 'sa':
190
+ logger.warning(f'Invalid token type: {token_type}')
191
+ return None
192
+
193
+ # Convert shortened field names back to standard names for
194
+ # compatibility
195
+ normalized_payload = {
196
+ 'iss': payload.get('i'), # issuer
197
+ 'iat': payload.get('t'), # issued at
198
+ 'sub': payload.get('u'), # subject (service account user ID)
199
+ 'token_id': payload.get('k'), # token ID
200
+ 'type': 'service_account', # expand shortened type
201
+ }
202
+
203
+ # Add expiration if present
204
+ if 'e' in payload:
205
+ normalized_payload['exp'] = payload['e']
206
+
207
+ return normalized_payload
208
+
209
+ except jwt.ExpiredSignatureError:
210
+ logger.warning('Token has expired')
211
+ return None
212
+ except jwt.InvalidTokenError as e:
213
+ logger.warning(f'Invalid token: {e}')
214
+ return None
215
+
216
+
217
+ # Singleton instance
218
+ token_service = TokenService()
@@ -1,8 +1,9 @@
1
1
  """Accelerator registry."""
2
2
  import typing
3
- from typing import Optional
3
+ from typing import List, Optional
4
4
 
5
- from sky.clouds import service_catalog
5
+ from sky import catalog
6
+ from sky.catalog import common as catalog_common
6
7
  from sky.utils import rich_utils
7
8
  from sky.utils import ux_utils
8
9
 
@@ -34,7 +35,8 @@ if typing.TYPE_CHECKING:
34
35
 
35
36
  # Use a cached version of accelerators to cloud mapping, so that we don't have
36
37
  # to download and read the catalog file for every cloud locally.
37
- _accelerator_df = service_catalog.common.read_catalog('common/accelerators.csv')
38
+ _accelerator_df = catalog_common.read_catalog('common/accelerators.csv')
39
+ _memory_df = catalog_common.read_catalog('common/metadata.csv')
38
40
 
39
41
  # List of non-GPU accelerators that are supported by our backend for job queue
40
42
  # scheduling.
@@ -45,6 +47,32 @@ _SCHEDULABLE_NON_GPU_ACCELERATORS = [
45
47
  ]
46
48
 
47
49
 
50
+ def get_devices_by_memory(memory: float,
51
+ plus: bool = False,
52
+ manufacturer: Optional[str] = None) -> List[str]:
53
+ """Returns a list of device names that meet the memory and manufacturer
54
+ requirements.
55
+
56
+ Args:
57
+ memory: The minimum memory size in GB.
58
+ plus: If True, returns devices with memory >= memory, otherwise returns
59
+ devices with memory == memory.
60
+ manufacturer: The manufacturer of the GPU.
61
+ """
62
+
63
+ # Filter by memory requirements
64
+ if plus:
65
+ df = _memory_df[_memory_df['MemoryGB'] >= memory]
66
+ else:
67
+ df = _memory_df[_memory_df['MemoryGB'] == memory]
68
+
69
+ # Filter by manufacturer if specified
70
+ if manufacturer is not None:
71
+ df = df[df['Manufacturer'].str.lower() == manufacturer.lower()]
72
+
73
+ return df['GPU'].tolist()
74
+
75
+
48
76
  def is_schedulable_non_gpu_accelerator(accelerator_name: str) -> bool:
49
77
  """Returns if this accelerator is a 'schedulable' non-GPU accelerator."""
50
78
  for name in _SCHEDULABLE_NON_GPU_ACCELERATORS:
@@ -80,10 +108,12 @@ def canonicalize_accelerator_name(accelerator: str,
80
108
  if not names and cloud_str in ['Kubernetes', None]:
81
109
  with rich_utils.safe_status(
82
110
  ux_utils.spinner_message('Listing accelerators on Kubernetes')):
83
- searched = service_catalog.list_accelerators(
111
+ # Only search for Kubernetes to reduce the lookup cost.
112
+ # For other clouds, the catalog has been searched in previous steps.
113
+ searched = catalog.list_accelerators(
84
114
  name_filter=accelerator,
85
115
  case_sensitive=False,
86
- clouds=cloud_str,
116
+ clouds='Kubernetes',
87
117
  )
88
118
  names = list(searched.keys())
89
119
  if accelerator in names:
@@ -1,9 +1,10 @@
1
1
  """Admin policy utils."""
2
+ import contextlib
2
3
  import copy
3
4
  import importlib
4
- import os
5
- import tempfile
6
- from typing import Optional, Tuple, Union
5
+ import typing
6
+ from typing import Iterator, Optional, Tuple, Union
7
+ from urllib import parse as urlparse
7
8
 
8
9
  import colorama
9
10
 
@@ -13,25 +14,45 @@ from sky import exceptions
13
14
  from sky import sky_logging
14
15
  from sky import skypilot_config
15
16
  from sky import task as task_lib
17
+ from sky.server.requests import request_names
16
18
  from sky.utils import common_utils
17
19
  from sky.utils import config_utils
18
20
  from sky.utils import ux_utils
19
21
 
20
22
  logger = sky_logging.init_logger(__name__)
21
23
 
24
+ if typing.TYPE_CHECKING:
25
+ from sky import models
22
26
 
23
- def _get_policy_cls(
24
- policy: Optional[str]) -> Optional[admin_policy.AdminPolicy]:
27
+
28
+ def _is_url(policy_string: str) -> bool:
29
+ """Check if the policy string is a URL."""
30
+ try:
31
+ parsed = urlparse.urlparse(policy_string)
32
+ return parsed.scheme in ('http', 'https')
33
+ except Exception: # pylint: disable=broad-except
34
+ return False
35
+
36
+
37
+ def _get_policy_impl(
38
+ policy_location: Optional[str]
39
+ ) -> Optional[admin_policy.PolicyInterface]:
25
40
  """Gets admin-defined policy."""
26
- if policy is None:
41
+ if policy_location is None:
27
42
  return None
43
+
44
+ if _is_url(policy_location):
45
+ # Use the built-in URL policy class when a URL is specified.
46
+ return admin_policy.RestfulAdminPolicy(policy_location)
47
+
48
+ # Handle module path format
28
49
  try:
29
- module_path, class_name = policy.rsplit('.', 1)
50
+ module_path, class_name = policy_location.rsplit('.', 1)
30
51
  module = importlib.import_module(module_path)
31
52
  except ImportError as e:
32
53
  with ux_utils.print_exception_no_traceback():
33
54
  raise ImportError(
34
- f'Failed to import policy module: {policy}. '
55
+ f'Failed to import policy module: {policy_location}. '
35
56
  'Please check if the module is installed in your Python '
36
57
  'environment.') from e
37
58
 
@@ -43,19 +64,48 @@ def _get_policy_cls(
43
64
  f'Could not find {class_name} class in module {module_path}. '
44
65
  'Please check with your policy admin for details.') from e
45
66
 
46
- # Check if the module implements the AdminPolicy interface.
67
+ # Currently we only allow users to define subclass of AdminPolicy
68
+ # instead of inheriting from PolicyInterface or PolicyTemplate.
47
69
  if not issubclass(policy_cls, admin_policy.AdminPolicy):
48
70
  with ux_utils.print_exception_no_traceback():
49
71
  raise ValueError(
50
- f'Policy class {policy!r} does not implement the AdminPolicy '
51
- 'interface. Please check with your policy admin for details.')
52
- return policy_cls
72
+ f'Policy class {policy_cls!r} does not implement the '
73
+ 'AdminPolicy interface. Please check with your policy admin '
74
+ 'for details.')
75
+ return policy_cls()
76
+
77
+
78
+ @contextlib.contextmanager
79
+ def apply_and_use_config_in_current_request(
80
+ entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
81
+ request_name: request_names.AdminPolicyRequestName,
82
+ request_options: Optional[admin_policy.RequestOptions] = None,
83
+ at_client_side: bool = False,
84
+ ) -> Iterator['dag_lib.Dag']:
85
+ """Applies an admin policy and overrides SkyPilot config for current request
86
+
87
+ This is a helper function of `apply()` that applies an admin policy and
88
+ overrides the SkyPilot config for the current request as a context manager.
89
+ The original SkyPilot config will be restored when the context manager is
90
+ exited.
91
+
92
+ Refer to `apply()` for more details.
93
+ """
94
+ original_config = skypilot_config.to_dict()
95
+ dag, mutated_config = apply(entrypoint, request_name, request_options,
96
+ at_client_side)
97
+ if mutated_config != original_config:
98
+ with skypilot_config.replace_skypilot_config(mutated_config):
99
+ yield dag
100
+ else:
101
+ yield dag
53
102
 
54
103
 
55
104
  def apply(
56
105
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
57
- use_mutated_config_in_current_request: bool = True,
106
+ request_name: request_names.AdminPolicyRequestName,
58
107
  request_options: Optional[admin_policy.RequestOptions] = None,
108
+ at_client_side: bool = False,
59
109
  ) -> Tuple['dag_lib.Dag', config_utils.Config]:
60
110
  """Applies an admin policy (if registered) to a DAG or a task.
61
111
 
@@ -79,29 +129,41 @@ def apply(
79
129
  else:
80
130
  dag = entrypoint
81
131
 
82
- policy = skypilot_config.get_nested(('admin_policy',), None)
83
- policy_cls = _get_policy_cls(policy)
84
- if policy_cls is None:
132
+ policy_location = skypilot_config.get_nested(('admin_policy',), None)
133
+ policy = _get_policy_impl(policy_location)
134
+ if policy is None:
85
135
  return dag, skypilot_config.to_dict()
86
136
 
87
- logger.info(f'Applying policy: {policy}')
88
- original_config = skypilot_config.to_dict()
89
- config = copy.deepcopy(original_config)
137
+ user = None
138
+ if at_client_side:
139
+ logger.info(f'Applying client admin policy: {policy}')
140
+ else:
141
+ # When being called by the server, the middleware has set the
142
+ # current user and this information is available at this point.
143
+ user = common_utils.get_current_user()
144
+ logger.info(f'Applying server admin policy: {policy}')
145
+ config = copy.deepcopy(skypilot_config.to_dict())
90
146
  mutated_dag = dag_lib.Dag()
91
147
  mutated_dag.name = dag.name
92
148
 
93
149
  mutated_config = None
94
150
  for task in dag.tasks:
95
- user_request = admin_policy.UserRequest(task, config, request_options)
151
+ user_request = admin_policy.UserRequest(task, config, request_name,
152
+ request_options, at_client_side,
153
+ user)
96
154
  try:
97
- mutated_user_request = policy_cls.validate_and_mutate(user_request)
155
+ mutated_user_request = policy.apply(user_request)
156
+ # Avoid duplicate exception wrapping.
157
+ except exceptions.UserRequestRejectedByPolicy as e:
158
+ with ux_utils.print_exception_no_traceback():
159
+ raise e
98
160
  except Exception as e: # pylint: disable=broad-except
99
161
  with ux_utils.print_exception_no_traceback():
100
162
  raise exceptions.UserRequestRejectedByPolicy(
101
163
  f'{colorama.Fore.RED}User request rejected by policy '
102
164
  f'{policy!r}{colorama.Fore.RESET}: '
103
165
  f'{common_utils.format_exception(e, use_bracket=True)}'
104
- ) from e
166
+ ) from None
105
167
  if mutated_config is None:
106
168
  mutated_config = mutated_user_request.skypilot_config
107
169
  else:
@@ -126,22 +188,6 @@ def apply(
126
188
  mutated_dag.graph.add_edge(mutated_dag.tasks[u_idx],
127
189
  mutated_dag.tasks[v_idx])
128
190
 
129
- if (use_mutated_config_in_current_request and
130
- original_config != mutated_config):
131
- with tempfile.NamedTemporaryFile(
132
- delete=False,
133
- mode='w',
134
- prefix='policy-mutated-skypilot-config-',
135
- suffix='.yaml') as temp_file:
136
-
137
- common_utils.dump_yaml(temp_file.name, dict(**mutated_config))
138
- os.environ[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = temp_file.name
139
- logger.debug(f'Updated SkyPilot config: {temp_file.name}')
140
- # TODO(zhwu): This is not a clean way to update the SkyPilot config,
141
- # because we are resetting the global context for a single DAG,
142
- # which is conceptually weird.
143
- importlib.reload(skypilot_config)
144
-
145
191
  logger.debug(f'Mutated user request: {mutated_user_request}')
146
192
  mutated_dag.policy_applied = True
147
193
  return mutated_dag, mutated_config
sky/utils/annotations.py CHANGED
@@ -1,14 +1,20 @@
1
1
  """Annotations for public APIs."""
2
2
 
3
3
  import functools
4
- from typing import Callable, Literal
4
+ from typing import Callable, Literal, TypeVar
5
+
6
+ import cachetools
7
+ from typing_extensions import ParamSpec
5
8
 
6
9
  # Whether the current process is a SkyPilot API server process.
7
10
  is_on_api_server = True
8
- FUNCTIONS_NEED_RELOAD_CACHE = []
11
+ _FUNCTIONS_NEED_RELOAD_CACHE = []
12
+
13
+ T = TypeVar('T')
14
+ P = ParamSpec('P')
9
15
 
10
16
 
11
- def client_api(func):
17
+ def client_api(func: Callable[P, T]) -> Callable[P, T]:
12
18
  """Mark a function as a client-side API.
13
19
 
14
20
  Code invoked by server-side functions will find annotations.is_on_api_server
@@ -38,14 +44,41 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
38
44
  lru_cache_kwargs: Keyword arguments for functools.lru_cache.
39
45
  """
40
46
 
41
- def decorator(func: Callable) -> Callable:
47
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
42
48
  if scope == 'global':
43
49
  return functools.lru_cache(*lru_cache_args,
44
50
  **lru_cache_kwargs)(func)
45
51
  else:
46
52
  cached_func = functools.lru_cache(*lru_cache_args,
47
53
  **lru_cache_kwargs)(func)
48
- FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
54
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
55
+ return cached_func
56
+
57
+ return decorator
58
+
59
+
60
+ def ttl_cache(scope: Literal['global', 'request'], *ttl_cache_args,
61
+ **ttl_cache_kwargs) -> Callable:
62
+ """TTLCache decorator for functions.
63
+
64
+ This decorator allows us to track which functions need to be reloaded for a
65
+ new request using the scope argument.
66
+ """
67
+
68
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
69
+ if scope == 'global':
70
+ return cachetools.cached(
71
+ cachetools.TTLCache(*ttl_cache_args, **ttl_cache_kwargs))(func)
72
+ else:
73
+ cached_func = cachetools.cached(
74
+ cachetools.TTLCache(*ttl_cache_args, **ttl_cache_kwargs))(func)
75
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
49
76
  return cached_func
50
77
 
51
78
  return decorator
79
+
80
+
81
+ def clear_request_level_cache():
82
+ """Clear the request-level cache."""
83
+ for func in _FUNCTIONS_NEED_RELOAD_CACHE:
84
+ func.cache_clear()
@@ -0,0 +1,78 @@
1
+ """Asyncio utilities."""
2
+
3
+ import asyncio
4
+ import functools
5
+ from typing import Set
6
+
7
+ _background_tasks: Set[asyncio.Task] = set()
8
+
9
+
10
+ def shield(func):
11
+ """Shield the decorated async function from cancellation.
12
+
13
+ If the outer coroutine is cancelled, the inner decorated function
14
+ will be protected from cancellation by asyncio.shield(). And we will
15
+ maintain a reference to the inner task to keep it from being GCed before
16
+ it is done.
17
+
18
+ For example, filelock.AsyncFileLock is not cancellation safe. The
19
+ following code:
20
+
21
+ async def fn_with_lock():
22
+ async with filelock.AsyncFileLock('lock'):
23
+ await asyncio.sleep(1)
24
+
25
+ is equivalent to:
26
+
27
+ # The lock may leak if the cancellation happens in
28
+ # lock.acquire() or lock.release()
29
+ async def fn_with_lock():
30
+ lock = filelock.AsyncFileLock('lock')
31
+ await lock.acquire()
32
+ try:
33
+ await asyncio.sleep(1)
34
+ finally:
35
+ await lock.release()
36
+
37
+ Shielding the function ensures that no cancellation will happen in the
38
+ function, thus the lock will be released properly:
39
+
40
+ @shield
41
+ async def fn_with_lock()
42
+
43
+ Note that the resource acquisition and release should usually be protected
44
+ in one @shield block but not separately, e.g.:
45
+
46
+ lock = filelock.AsyncFileLock('lock')
47
+
48
+ @shield
49
+ async def acquire():
50
+ await lock.acquire()
51
+
52
+ @shield
53
+ async def release():
54
+ await lock.release()
55
+
56
+ async def fn_with_lock():
57
+ await acquire()
58
+ try:
59
+ do_something()
60
+ finally:
61
+ await release()
62
+
63
+ The above code is not safe because if `fn_with_lock` is cancelled,
64
+ `acquire()` and `release()` will be executed in the background
65
+ concurrently and cause race conditions.
66
+ """
67
+
68
+ @functools.wraps(func)
69
+ async def async_wrapper(*args, **kwargs):
70
+ task = asyncio.create_task(func(*args, **kwargs))
71
+ try:
72
+ return await asyncio.shield(task)
73
+ except asyncio.CancelledError:
74
+ _background_tasks.add(task)
75
+ task.add_done_callback(lambda _: _background_tasks.discard(task))
76
+ raise
77
+
78
+ return async_wrapper
sky/utils/atomic.py CHANGED
@@ -1,4 +1,4 @@
1
- """Atomic structures and utilties."""
1
+ """Atomic structures and utilities."""
2
2
 
3
3
  import threading
4
4