skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/users/permission.py CHANGED
@@ -3,7 +3,7 @@ import contextlib
3
3
  import hashlib
4
4
  import logging
5
5
  import os
6
- from typing import Generator, List
6
+ from typing import Generator, List, Optional
7
7
 
8
8
  import casbin
9
9
  import filelock
@@ -14,6 +14,7 @@ from sky import models
14
14
  from sky import sky_logging
15
15
  from sky.skylet import constants
16
16
  from sky.users import rbac
17
+ from sky.utils import annotations
17
18
  from sky.utils import common_utils
18
19
  from sky.utils.db import db_utils
19
20
 
@@ -27,14 +28,14 @@ logger = sky_logging.init_logger(__name__)
27
28
  POLICY_UPDATE_LOCK_PATH = os.path.expanduser('~/.sky/.policy_update.lock')
28
29
  POLICY_UPDATE_LOCK_TIMEOUT_SECONDS = 20
29
30
 
30
- _enforcer_instance = None
31
+ _enforcer_instance: Optional['PermissionService'] = None
31
32
 
32
33
 
33
34
  class PermissionService:
34
35
  """Permission service for SkyPilot API Server."""
35
36
 
36
37
  def __init__(self):
37
- self.enforcer = None
38
+ self.enforcer: Optional[casbin.Enforcer] = None
38
39
 
39
40
  def _lazy_initialize(self):
40
41
  if self.enforcer is not None:
@@ -42,7 +43,6 @@ class PermissionService:
42
43
  with _policy_lock():
43
44
  global _enforcer_instance
44
45
  if _enforcer_instance is None:
45
- _enforcer_instance = self
46
46
  engine = global_user_state.initialize_and_get_db()
47
47
  db_utils.add_all_tables_to_db_sqlalchemy(
48
48
  sqlalchemy_adapter.Base.metadata, engine)
@@ -52,11 +52,23 @@ class PermissionService:
52
52
  'model.conf')
53
53
  enforcer = casbin.Enforcer(model_path, adapter)
54
54
  self.enforcer = enforcer
55
+ # Only set the enforcer instance once the enforcer
56
+ # is successfully initialized, if we change it and then fail
57
+ # we will set it to None and all subsequent calls will fail.
58
+ _enforcer_instance = self
55
59
  self._maybe_initialize_policies()
56
60
  self._maybe_initialize_basic_auth_user()
57
61
  else:
62
+ assert _enforcer_instance is not None
58
63
  self.enforcer = _enforcer_instance.enforcer
59
64
 
65
+ def _ensure_enforcer(self) -> casbin.Enforcer:
66
+ """Ensure enforcer is initialized and return it."""
67
+ self._lazy_initialize()
68
+ assert self.enforcer is not None, (
69
+ 'Enforcer should be initialized after _lazy_initialize()')
70
+ return self.enforcer
71
+
60
72
  def _maybe_initialize_basic_auth_user(self) -> None:
61
73
  """Initialize basic auth user if it is enabled."""
62
74
  basic_auth = os.environ.get(constants.SKYPILOT_INITIAL_BASIC_AUTH)
@@ -72,9 +84,9 @@ class PermissionService:
72
84
  return
73
85
  global_user_state.add_or_update_user(
74
86
  models.User(id=user_hash, name=username, password=password))
75
- self.enforcer.add_grouping_policy(user_hash,
76
- rbac.RoleName.ADMIN.value)
77
- self.enforcer.save_policy()
87
+ enforcer = self._ensure_enforcer()
88
+ enforcer.add_grouping_policy(user_hash, rbac.RoleName.ADMIN.value)
89
+ enforcer.save_policy()
78
90
  logger.info(f'Basic auth user {username} initialized')
79
91
 
80
92
  def _maybe_initialize_policies(self) -> None:
@@ -86,7 +98,8 @@ class PermissionService:
86
98
 
87
99
  # Check if policies are already initialized by looking for existing
88
100
  # permission policies in the enforcer
89
- existing_policies = self.enforcer.get_policy()
101
+ enforcer = self._ensure_enforcer()
102
+ existing_policies = enforcer.get_policy()
90
103
 
91
104
  # If we already have policies for the expected roles, skip
92
105
  # initialization
@@ -123,7 +136,7 @@ class PermissionService:
123
136
  logger.debug('Policies not found or incomplete, initializing...')
124
137
  # Only clear p policies (permission policies),
125
138
  # keep g policies (role policies)
126
- self.enforcer.remove_filtered_policy(0)
139
+ enforcer.remove_filtered_policy(0)
127
140
  for role, permissions in role_permissions.items():
128
141
  if permissions['permissions'] and 'blocklist' in permissions[
129
142
  'permissions']:
@@ -133,14 +146,14 @@ class PermissionService:
133
146
  method = item['method']
134
147
  logger.debug(f'Adding role policy: role={role}, '
135
148
  f'path={path}, method={method}')
136
- self.enforcer.add_policy(role, path, method)
149
+ enforcer.add_policy(role, path, method)
137
150
  policy_updated = True
138
151
 
139
152
  for workspace_name, users in workspace_policy_permissions.items():
140
153
  for user in users:
141
154
  logger.debug(f'Initializing workspace policy: user={user}, '
142
155
  f'workspace={workspace_name}')
143
- self.enforcer.add_policy(user, workspace_name, '*')
156
+ enforcer.add_policy(user, workspace_name, '*')
144
157
  policy_updated = True
145
158
  logger.debug('Policies initialized successfully')
146
159
  else:
@@ -153,7 +166,7 @@ class PermissionService:
153
166
  policy_updated = policy_updated or user_added
154
167
 
155
168
  if policy_updated:
156
- self.enforcer.save_policy()
169
+ enforcer.save_policy()
157
170
 
158
171
  def add_user_if_not_exists(self, user_id: str) -> None:
159
172
  """Add user role relationship."""
@@ -167,34 +180,35 @@ class PermissionService:
167
180
  Returns:
168
181
  True if the user was added, False otherwise.
169
182
  """
170
- user_roles = self.enforcer.get_roles_for_user(user_id)
183
+ enforcer = self._ensure_enforcer()
184
+ user_roles = enforcer.get_roles_for_user(user_id)
171
185
  if not user_roles:
172
- self.enforcer.add_grouping_policy(user_id, rbac.get_default_role())
186
+ enforcer.add_grouping_policy(user_id, rbac.get_default_role())
173
187
  return True
174
188
  return False
175
189
 
176
190
  def delete_user(self, user_id: str) -> None:
177
191
  """Delete user role relationship."""
178
- self._lazy_initialize()
179
192
  with _policy_lock():
180
193
  # Get current roles
181
194
  self._load_policy_no_lock()
182
195
  # Avoid calling get_user_roles, as it will require the lock.
183
- current_roles = self.enforcer.get_roles_for_user(user_id)
196
+ enforcer = self._ensure_enforcer()
197
+ current_roles = enforcer.get_roles_for_user(user_id)
184
198
  if not current_roles:
185
199
  logger.debug(f'User {user_id} has no roles')
186
200
  return
187
- self.enforcer.remove_grouping_policy(user_id, current_roles[0])
188
- self.enforcer.save_policy()
201
+ enforcer.remove_grouping_policy(user_id, current_roles[0])
202
+ enforcer.save_policy()
189
203
 
190
204
  def update_role(self, user_id: str, new_role: str) -> None:
191
205
  """Update user role relationship."""
192
- self._lazy_initialize()
193
206
  with _policy_lock():
194
207
  # Get current roles
195
208
  self._load_policy_no_lock()
196
209
  # Avoid calling get_user_roles, as it will require the lock.
197
- current_roles = self.enforcer.get_roles_for_user(user_id)
210
+ enforcer = self._ensure_enforcer()
211
+ current_roles = enforcer.get_roles_for_user(user_id)
198
212
  if not current_roles:
199
213
  logger.debug(f'User {user_id} has no roles')
200
214
  else:
@@ -203,11 +217,11 @@ class PermissionService:
203
217
  if current_role == new_role:
204
218
  logger.debug(f'User {user_id} already has role {new_role}')
205
219
  return
206
- self.enforcer.remove_grouping_policy(user_id, current_role)
220
+ enforcer.remove_grouping_policy(user_id, current_role)
207
221
 
208
222
  # Update user role
209
- self.enforcer.add_grouping_policy(user_id, new_role)
210
- self.enforcer.save_policy()
223
+ enforcer.add_grouping_policy(user_id, new_role)
224
+ enforcer.save_policy()
211
225
 
212
226
  def get_user_roles(self, user_id: str) -> List[str]:
213
227
  """Get all roles for a user.
@@ -222,15 +236,15 @@ class PermissionService:
222
236
  Returns:
223
237
  A list of role names that the user has.
224
238
  """
225
- self._lazy_initialize()
226
239
  self._load_policy_no_lock()
227
- return self.enforcer.get_roles_for_user(user_id)
240
+ enforcer = self._ensure_enforcer()
241
+ return enforcer.get_roles_for_user(user_id)
228
242
 
229
243
  def get_users_for_role(self, role: str) -> List[str]:
230
244
  """Get all users for a role."""
231
- self._lazy_initialize()
232
245
  self._load_policy_no_lock()
233
- return self.enforcer.get_users_for_role(role)
246
+ enforcer = self._ensure_enforcer()
247
+ return enforcer.get_users_for_role(role)
234
248
 
235
249
  def check_endpoint_permission(self, user_id: str, path: str,
236
250
  method: str) -> bool:
@@ -241,19 +255,22 @@ class PermissionService:
241
255
  # it is a hot path in every request. It is ok to have a stale policy,
242
256
  # as long as it is eventually consistent.
243
257
  # self._load_policy_no_lock()
244
- self._lazy_initialize()
245
- return self.enforcer.enforce(user_id, path, method)
258
+ enforcer = self._ensure_enforcer()
259
+ return enforcer.enforce(user_id, path, method)
246
260
 
247
261
  def _load_policy_no_lock(self):
248
262
  """Load policy from storage."""
249
- self.enforcer.load_policy()
263
+ enforcer = self._ensure_enforcer()
264
+ enforcer.load_policy()
250
265
 
251
266
  def load_policy(self):
252
267
  """Load policy from storage with lock."""
253
- self._lazy_initialize()
254
268
  with _policy_lock():
255
269
  self._load_policy_no_lock()
256
270
 
271
+ # Right now, not a lot of users are using multiple workspaces,
272
+ # so 5 should be more than enough.
273
+ @annotations.lru_cache(scope='request', maxsize=5)
257
274
  def check_workspace_permission(self, user_id: str,
258
275
  workspace_name: str) -> bool:
259
276
  """Check workspace permission.
@@ -266,7 +283,6 @@ class PermissionService:
266
283
  For public workspaces, the permission is granted via a wildcard policy
267
284
  ('*').
268
285
  """
269
- self._lazy_initialize()
270
286
  if os.getenv(constants.ENV_VAR_IS_SKYPILOT_SERVER) is None:
271
287
  # When it is not on API server, we allow all users to access all
272
288
  # workspaces, as the workspace check has been done on API server.
@@ -279,7 +295,8 @@ class PermissionService:
279
295
  # r.act == p.act
280
296
  # This means if there's a policy ('*', workspace_name, '*'), it will
281
297
  # match any user
282
- result = self.enforcer.enforce(user_id, workspace_name, '*')
298
+ enforcer = self._ensure_enforcer()
299
+ result = enforcer.enforce(user_id, workspace_name, '*')
283
300
  logger.debug(f'Workspace permission check: user={user_id}, '
284
301
  f'workspace={workspace_name}, result={result}')
285
302
  return result
@@ -323,13 +340,13 @@ class PermissionService:
323
340
  For public workspaces, this should be ['*'].
324
341
  For private workspaces, this should be specific user IDs.
325
342
  """
326
- self._lazy_initialize()
327
343
  with _policy_lock():
344
+ enforcer = self._ensure_enforcer()
328
345
  for user in users:
329
346
  logger.debug(f'Adding workspace policy: user={user}, '
330
347
  f'workspace={workspace_name}')
331
- self.enforcer.add_policy(user, workspace_name, '*')
332
- self.enforcer.save_policy()
348
+ enforcer.add_policy(user, workspace_name, '*')
349
+ enforcer.save_policy()
333
350
 
334
351
  def update_workspace_policy(self, workspace_name: str,
335
352
  users: List[str]) -> None:
@@ -341,24 +358,24 @@ class PermissionService:
341
358
  For public workspaces, this should be ['*'].
342
359
  For private workspaces, this should be specific user IDs.
343
360
  """
344
- self._lazy_initialize()
345
361
  with _policy_lock():
346
362
  self._load_policy_no_lock()
363
+ enforcer = self._ensure_enforcer()
347
364
  # Remove all existing policies for this workspace
348
- self.enforcer.remove_filtered_policy(1, workspace_name)
365
+ enforcer.remove_filtered_policy(1, workspace_name)
349
366
  # Add new policies
350
367
  for user in users:
351
368
  logger.debug(f'Updating workspace policy: user={user}, '
352
369
  f'workspace={workspace_name}')
353
- self.enforcer.add_policy(user, workspace_name, '*')
354
- self.enforcer.save_policy()
370
+ enforcer.add_policy(user, workspace_name, '*')
371
+ enforcer.save_policy()
355
372
 
356
373
  def remove_workspace_policy(self, workspace_name: str) -> None:
357
374
  """Remove workspace policy."""
358
- self._lazy_initialize()
359
375
  with _policy_lock():
360
- self.enforcer.remove_filtered_policy(1, workspace_name)
361
- self.enforcer.save_policy()
376
+ enforcer = self._ensure_enforcer()
377
+ enforcer.remove_filtered_policy(1, workspace_name)
378
+ enforcer.save_policy()
362
379
 
363
380
 
364
381
  @contextlib.contextmanager
@@ -3,6 +3,7 @@ import typing
3
3
  from typing import List, Optional
4
4
 
5
5
  from sky import catalog
6
+ from sky.catalog import common as catalog_common
6
7
  from sky.utils import rich_utils
7
8
  from sky.utils import ux_utils
8
9
 
@@ -34,8 +35,8 @@ if typing.TYPE_CHECKING:
34
35
 
35
36
  # Use a cached version of accelerators to cloud mapping, so that we don't have
36
37
  # to download and read the catalog file for every cloud locally.
37
- _accelerator_df = catalog.common.read_catalog('common/accelerators.csv')
38
- _memory_df = catalog.common.read_catalog('common/metadata.csv')
38
+ _accelerator_df = catalog_common.read_catalog('common/accelerators.csv')
39
+ _memory_df = catalog_common.read_catalog('common/metadata.csv')
39
40
 
40
41
  # List of non-GPU accelerators that are supported by our backend for job queue
41
42
  # scheduling.
@@ -107,10 +108,12 @@ def canonicalize_accelerator_name(accelerator: str,
107
108
  if not names and cloud_str in ['Kubernetes', None]:
108
109
  with rich_utils.safe_status(
109
110
  ux_utils.spinner_message('Listing accelerators on Kubernetes')):
111
+ # Only search for Kubernetes to reduce the lookup cost.
112
+ # For other clouds, the catalog has been searched in previous steps.
110
113
  searched = catalog.list_accelerators(
111
114
  name_filter=accelerator,
112
115
  case_sensitive=False,
113
- clouds=cloud_str,
116
+ clouds='Kubernetes',
114
117
  )
115
118
  names = list(searched.keys())
116
119
  if accelerator in names:
@@ -2,8 +2,9 @@
2
2
  import contextlib
3
3
  import copy
4
4
  import importlib
5
+ import typing
5
6
  from typing import Iterator, Optional, Tuple, Union
6
- import urllib.parse
7
+ from urllib import parse as urlparse
7
8
 
8
9
  import colorama
9
10
 
@@ -13,17 +14,21 @@ from sky import exceptions
13
14
  from sky import sky_logging
14
15
  from sky import skypilot_config
15
16
  from sky import task as task_lib
17
+ from sky.server.requests import request_names
16
18
  from sky.utils import common_utils
17
19
  from sky.utils import config_utils
18
20
  from sky.utils import ux_utils
19
21
 
20
22
  logger = sky_logging.init_logger(__name__)
21
23
 
24
+ if typing.TYPE_CHECKING:
25
+ from sky import models
26
+
22
27
 
23
28
  def _is_url(policy_string: str) -> bool:
24
29
  """Check if the policy string is a URL."""
25
30
  try:
26
- parsed = urllib.parse.urlparse(policy_string)
31
+ parsed = urlparse.urlparse(policy_string)
27
32
  return parsed.scheme in ('http', 'https')
28
33
  except Exception: # pylint: disable=broad-except
29
34
  return False
@@ -73,6 +78,7 @@ def _get_policy_impl(
73
78
  @contextlib.contextmanager
74
79
  def apply_and_use_config_in_current_request(
75
80
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
81
+ request_name: request_names.AdminPolicyRequestName,
76
82
  request_options: Optional[admin_policy.RequestOptions] = None,
77
83
  at_client_side: bool = False,
78
84
  ) -> Iterator['dag_lib.Dag']:
@@ -86,7 +92,8 @@ def apply_and_use_config_in_current_request(
86
92
  Refer to `apply()` for more details.
87
93
  """
88
94
  original_config = skypilot_config.to_dict()
89
- dag, mutated_config = apply(entrypoint, request_options, at_client_side)
95
+ dag, mutated_config = apply(entrypoint, request_name, request_options,
96
+ at_client_side)
90
97
  if mutated_config != original_config:
91
98
  with skypilot_config.replace_skypilot_config(mutated_config):
92
99
  yield dag
@@ -96,6 +103,7 @@ def apply_and_use_config_in_current_request(
96
103
 
97
104
  def apply(
98
105
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
106
+ request_name: request_names.AdminPolicyRequestName,
99
107
  request_options: Optional[admin_policy.RequestOptions] = None,
100
108
  at_client_side: bool = False,
101
109
  ) -> Tuple['dag_lib.Dag', config_utils.Config]:
@@ -126,9 +134,13 @@ def apply(
126
134
  if policy is None:
127
135
  return dag, skypilot_config.to_dict()
128
136
 
137
+ user = None
129
138
  if at_client_side:
130
139
  logger.info(f'Applying client admin policy: {policy}')
131
140
  else:
141
+ # When being called by the server, the middleware has set the
142
+ # current user and this information is available at this point.
143
+ user = common_utils.get_current_user()
132
144
  logger.info(f'Applying server admin policy: {policy}')
133
145
  config = copy.deepcopy(skypilot_config.to_dict())
134
146
  mutated_dag = dag_lib.Dag()
@@ -136,8 +148,9 @@ def apply(
136
148
 
137
149
  mutated_config = None
138
150
  for task in dag.tasks:
139
- user_request = admin_policy.UserRequest(task, config, request_options,
140
- at_client_side)
151
+ user_request = admin_policy.UserRequest(task, config, request_name,
152
+ request_options, at_client_side,
153
+ user)
141
154
  try:
142
155
  mutated_user_request = policy.apply(user_request)
143
156
  # Avoid duplicate exception wrapping.
sky/utils/annotations.py CHANGED
@@ -3,6 +3,7 @@
3
3
  import functools
4
4
  from typing import Callable, Literal, TypeVar
5
5
 
6
+ import cachetools
6
7
  from typing_extensions import ParamSpec
7
8
 
8
9
  # Whether the current process is a SkyPilot API server process.
@@ -56,6 +57,27 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
56
57
  return decorator
57
58
 
58
59
 
60
def ttl_cache(scope: Literal['global', 'request'], *ttl_cache_args,
              **ttl_cache_kwargs) -> Callable:
    """Memoize a function with a ``cachetools.TTLCache``.

    Every argument after ``scope`` is forwarded unchanged to the
    ``cachetools.TTLCache`` constructor. For any scope other than 'global',
    the cached function is also recorded in ``_FUNCTIONS_NEED_RELOAD_CACHE``
    so that the functions to reload for a new request can be tracked.
    """

    def decorator(func: Callable[P, T]) -> Callable[P, T]:
        # Each decorated function gets its own, freshly created cache.
        store = cachetools.TTLCache(*ttl_cache_args, **ttl_cache_kwargs)
        memoized = cachetools.cached(store)(func)
        if scope != 'global':
            _FUNCTIONS_NEED_RELOAD_CACHE.append(memoized)
        return memoized

    return decorator
79
+
80
+
59
81
  def clear_request_level_cache():
60
82
  """Clear the request-level cache."""
61
83
  for func in _FUNCTIONS_NEED_RELOAD_CACHE:
@@ -0,0 +1,78 @@
1
+ """Asyncio utilities."""
2
+
3
+ import asyncio
4
+ import functools
5
+ from typing import Set
6
+
7
# Strong references to shielded tasks whose awaiting caller was cancelled.
# A task is kept here until it finishes so that it is not garbage collected
# while still running.
_background_tasks: Set[asyncio.Task] = set()


def shield(func):
    """Shield the decorated async function from cancellation.

    If the outer coroutine is cancelled, the inner decorated function is
    protected from cancellation by asyncio.shield(), and a reference to the
    inner task is kept so that it is not garbage collected before it is done.
    The cancellation is still propagated to the caller.

    For example, filelock.AsyncFileLock is not cancellation safe. The
    following code:

        async def fn_with_lock():
            async with filelock.AsyncFileLock('lock'):
                await asyncio.sleep(1)

    is equivalent to:

        # The lock may leak if the cancellation happens in
        # lock.acquire() or lock.release()
        async def fn_with_lock():
            lock = filelock.AsyncFileLock('lock')
            await lock.acquire()
            try:
                await asyncio.sleep(1)
            finally:
                await lock.release()

    Shielding the function ensures that no cancellation happens inside the
    function, so the lock is released properly:

        @shield
        async def fn_with_lock():
            ...

    Note that resource acquisition and release should usually be protected
    together in one @shield block rather than separately, e.g.:

        lock = filelock.AsyncFileLock('lock')

        @shield
        async def acquire():
            await lock.acquire()

        @shield
        async def release():
            await lock.release()

        async def fn_with_lock():
            await acquire()
            try:
                do_something()
            finally:
                await release()

    The above code is not safe: if `fn_with_lock` is cancelled, `acquire()`
    and `release()` are executed in the background concurrently, which causes
    race conditions.

    Args:
        func: The async function to protect.

    Returns:
        An async wrapper with the same call signature as `func`.

    Raises:
        asyncio.CancelledError: Re-raised to the caller when the await on the
            shielded task is cancelled.
    """

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        task = asyncio.create_task(func(*args, **kwargs))
        try:
            return await asyncio.shield(task)
        except asyncio.CancelledError:
            # Hold a reference to the task and drop it once the task is done;
            # the done-callback receives the task itself as its argument.
            _background_tasks.add(task)
            task.add_done_callback(_background_tasks.discard)
            raise

    return async_wrapper
sky/utils/atomic.py CHANGED
@@ -1,4 +1,4 @@
1
- """Atomic structures and utilties."""
1
+ """Atomic structures and utilities."""
2
2
 
3
3
  import threading
4
4
 
@@ -0,0 +1,153 @@
1
+ """Utils for managing SkyPilot SSH key pairs."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import Tuple
6
+
7
+ import filelock
8
+
9
+ from sky import global_user_state
10
+ from sky import sky_logging
11
+ from sky.utils import common_utils
12
+
13
+ logger = sky_logging.init_logger(__name__)
14
+
15
+ MAX_TRIALS = 64
16
+ # TODO(zhwu): Support user specified key pair.
17
+ # We intentionally do not store the ssh key pair in
18
+ # ~/.sky/api_server/clients, i.e. sky.server.common.API_SERVER_CLIENT_DIR,
19
+ # because ssh key pair need to persist across API server restarts, while
20
+ # the former dir is ephemeral.
21
+ _SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
22
+
23
+
24
def get_ssh_key_and_lock_path(user_hash: str) -> Tuple[str, str, str]:
    """Return the private key, public key and lock file paths for a user.

    The user's key directory is created with mode 0o700 (after expanding the
    user home) if it does not exist yet. The returned paths are built from the
    unexpanded `_SSH_KEY_PATH_PREFIX`, so callers expand them as needed.

    Args:
        user_hash: Hash of the user, substituted into the key path prefix.

    Returns:
        A `(private_key_path, public_key_path, lock_path)` tuple.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
    os.makedirs(os.path.expanduser(key_dir), mode=0o700, exist_ok=True)

    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, file_name)
        for file_name in ('sky-key', 'sky-key.pub', '.__internal-sky-key.lock'))
    return private_key_path, public_key_path, lock_path
34
+
35
+
36
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Generate a new 2048-bit RSA key pair.

    Returns:
        A `(public_key, private_key)` tuple of stripped UTF-8 strings: the
        public key in OpenSSH format and the unencrypted private key as
        TraditionalOpenSSL PEM.
    """
    # cryptography is imported lazily so that this expensive third-party
    # import is only paid when a key actually has to be generated.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(public_exponent=65537,
                                       key_size=2048,
                                       backend=default_backend())

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption())
    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH)

    private_key = private_pem.decode('utf-8').strip()
    public_key = public_openssh.decode('utf-8').strip()
    return public_key, private_key
59
+
60
+
61
def _save_key_pair(private_key_path: str, public_key_path: str,
                   private_key: str, public_key: str) -> None:
    """Write a key pair to disk as UTF-8 text.

    The parent directory of each key file is created with mode 0o700 when it
    is missing. The private key file is opened with creation mode 0o600 and
    the public key file with 0o644; the mode is passed to os.open(), so it
    only takes effect when the file is newly created.

    Args:
        private_key_path: Destination path of the private key.
        public_key_path: Destination path of the public key.
        private_key: Private key content.
        public_key: Public key content.
    """
    for key_path in (private_key_path, public_key_path):
        key_dir = os.path.dirname(key_path)
        # A bare file name has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError; the current directory needs no creation.
        if key_dir:
            os.makedirs(key_dir, exist_ok=True, mode=0o700)

    targets = (
        (private_key_path, 0o600, private_key),
        (public_key_path, 0o644, public_key),
    )
    for key_path, file_mode, content in targets:
        with open(key_path,
                  'w',
                  encoding='utf-8',
                  opener=functools.partial(os.open, mode=file_mode)) as f:
            f.write(content)
79
+
80
+
81
def get_or_generate_keys() -> Tuple[str, str]:
    """Returns the absolute private and public key paths.

    If the current user's private key file is missing, the key pair is read
    from `global_user_state`; when no pair is stored there, a new RSA pair is
    generated and stored first. The pair is then written to disk. The check
    and the write happen while holding a file lock.
    """
    user_hash = common_utils.get_user_hash()
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path)
        for path in get_ssh_key_and_lock_path(user_hash))

    # Make sure the directory that holds the lock file exists with 0o700
    # permission.
    # NOTE(review): the previous comment here referred to ~/.sky/generated/ssh
    # and to ssh configs written by backend_utils.SSHConfigHelper; the
    # directory created below is the parent of the lock file -- confirm.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True, mode=0o700)
    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            ssh_public_key, ssh_private_key, exists = (
                global_user_state.get_ssh_keys(user_hash))
            if not exists:
                ssh_public_key, ssh_private_key = _generate_rsa_key_pair()
                global_user_state.set_ssh_keys(user_hash, ssh_public_key,
                                               ssh_private_key)
            _save_key_pair(private_key_path, public_key_path, ssh_private_key,
                           ssh_public_key)
    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.')
    return private_key_path, public_key_path
109
+
110
+
111
def create_ssh_key_files_from_db(private_key_path: str) -> bool:
    """Creates the ssh key files from the database.

    Args:
        private_key_path: Path of the private key; the user hash is taken
            from its third-to-last path component.

    Returns:
        True if the ssh key files are created successfully (or both already
        exist), False if no keys are stored for the user.
    """
    # Expected layout of the private key path:
    # ~/.sky/clients/<user_hash>/ssh/sky-key
    path_parts = os.path.normpath(private_key_path).split(os.path.sep)
    assert path_parts[-1] == 'sky-key'
    assert path_parts[-2] == 'ssh'
    user_hash = path_parts[-3]

    expected_private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path)
        for path in get_ssh_key_and_lock_path(user_hash))
    assert private_key_path == expected_private_key_path, (
        f'Private key path {private_key_path} does not '
        'match the generated path '
        f'{expected_private_key_path}')
    private_key_path = os.path.expanduser(private_key_path)

    if os.path.exists(private_key_path) and os.path.exists(public_key_path):
        return True

    # Make sure the directory that holds the lock file exists with 0o700
    # permission.
    # NOTE(review): the previous comment here referred to ~/.sky/generated/ssh
    # and to ssh configs written by backend_utils.SSHConfigHelper; the
    # directory created below is the parent of the lock file -- confirm.
    os.makedirs(os.path.dirname(lock_path), exist_ok=True, mode=0o700)
    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            ssh_public_key, ssh_private_key, exists = (
                global_user_state.get_ssh_keys(user_hash))
            if not exists:
                logger.debug(f'SSH keys not found for user {user_hash}')
                return False
            _save_key_pair(private_key_path, public_key_path, ssh_private_key,
                           ssh_public_key)
    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.')
    return True