skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -31,6 +31,7 @@ import time
31
31
  import typing
32
32
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
33
33
 
34
+ import psutil
34
35
  import setproctitle
35
36
 
36
37
  from sky import exceptions
@@ -38,14 +39,18 @@ from sky import global_user_state
38
39
  from sky import models
39
40
  from sky import sky_logging
40
41
  from sky import skypilot_config
42
+ from sky.metrics import utils as metrics_utils
41
43
  from sky.server import common as server_common
42
44
  from sky.server import config as server_config
43
45
  from sky.server import constants as server_constants
44
46
  from sky.server import metrics as metrics_lib
47
+ from sky.server import plugins
45
48
  from sky.server.requests import payloads
46
49
  from sky.server.requests import preconditions
47
50
  from sky.server.requests import process
51
+ from sky.server.requests import request_names
48
52
  from sky.server.requests import requests as api_requests
53
+ from sky.server.requests import threads
49
54
  from sky.server.requests.queues import local_queue
50
55
  from sky.server.requests.queues import mp_queue
51
56
  from sky.skylet import constants
@@ -79,6 +84,31 @@ logger = sky_logging.init_logger(__name__)
79
84
  # platforms, including macOS.
80
85
  multiprocessing.set_start_method('spawn', force=True)
81
86
 
87
+ # An upper limit on the number of request-execution threads per server
88
+ # process. It is unlikely to be reached, so it allows high concurrency while
89
+ # still preventing the server process from becoming overloaded.
90
+ _REQUEST_THREADS_LIMIT = 128
91
+
92
+ _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
93
+ # A dedicated thread pool executor for running synchronous requests from
94
+ # coroutines, to avoid:
95
+ # 1. blocking the event loop;
96
+ # 2. exhausting the default thread pool executor of the event loop.
97
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
98
+
99
+
100
+ def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
101
+ """Lazy init and return the request thread executor for current process."""
102
+ global _REQUEST_THREAD_EXECUTOR
103
+ if _REQUEST_THREAD_EXECUTOR is not None:
104
+ return _REQUEST_THREAD_EXECUTOR
105
+ with _REQUEST_THREAD_EXECUTOR_LOCK:
106
+ if _REQUEST_THREAD_EXECUTOR is None:
107
+ _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
108
+ name='request_thread_executor',
109
+ max_workers=_REQUEST_THREADS_LIMIT)
110
+ return _REQUEST_THREAD_EXECUTOR
111
+
82
112
 
83
113
  class RequestQueue:
84
114
  """The queue for the requests, either redis or multiprocessing.
@@ -130,6 +160,12 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
130
160
  def executor_initializer(proc_group: str):
131
161
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
132
162
  f'{multiprocessing.current_process().pid}')
163
+ # Load plugins for executor process.
164
+ plugins.load_plugins(plugins.ExtensionContext())
165
+ # Executor never stops, unless the whole process is killed.
166
+ threading.Thread(target=metrics_lib.process_monitor,
167
+ args=(f'worker:{proc_group}', threading.Event()),
168
+ daemon=True).start()
133
169
 
134
170
 
135
171
  class RequestWorker:
@@ -182,10 +218,11 @@ class RequestWorker:
182
218
  time.sleep(0.1)
183
219
  return
184
220
  request_id, ignore_return_value, _ = request_element
185
- request = api_requests.get_request(request_id)
221
+ request = api_requests.get_request(request_id, fields=['status'])
186
222
  assert request is not None, f'Request with ID {request_id} is None'
187
223
  if request.status == api_requests.RequestStatus.CANCELLED:
188
224
  return
225
+ del request
189
226
  logger.info(f'[{self}] Submitting request: {request_id}')
190
227
  # Start additional process to run the request, so that it can be
191
228
  # cancelled when requested by a user.
@@ -196,6 +233,12 @@ class RequestWorker:
196
233
  fut = executor.submit_until_success(
197
234
  _request_execution_wrapper, request_id, ignore_return_value,
198
235
  self.num_db_connections_per_worker)
236
+ # Decrement the free executor count when a request starts
237
+ if metrics_utils.METRICS_ENABLED:
238
+ if self.schedule_type == api_requests.ScheduleType.LONG:
239
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.dec()
240
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
241
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.dec()
199
242
  # Monitor the result of the request execution.
200
243
  threading.Thread(target=self.handle_task_result,
201
244
  args=(fut, request_element),
@@ -230,9 +273,23 @@ class RequestWorker:
230
273
  queue.put(request_element)
231
274
  except exceptions.ExecutionRetryableError as e:
232
275
  time.sleep(e.retry_wait_seconds)
276
+ # Reset the request status to PENDING so it can be picked up again.
277
+ # Assume retryable since the error is ExecutionRetryableError.
278
+ request_id, _, _ = request_element
279
+ with api_requests.update_request(request_id) as request_task:
280
+ assert request_task is not None, request_id
281
+ request_task.status = api_requests.RequestStatus.PENDING
233
282
  # Reschedule the request.
234
283
  queue = _get_queue(self.schedule_type)
235
284
  queue.put(request_element)
285
+ logger.info(f'Rescheduled request {request_id} for retry')
286
+ finally:
287
+ # Increment the free executor count when a request finishes
288
+ if metrics_utils.METRICS_ENABLED:
289
+ if self.schedule_type == api_requests.ScheduleType.LONG:
290
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.inc()
291
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
292
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.inc()
236
293
 
237
294
  def run(self) -> None:
238
295
  # Handle the SIGTERM signal to abort the executor process gracefully.
@@ -254,6 +311,16 @@ class RequestWorker:
254
311
  burst_workers=self.burstable_parallelism,
255
312
  initializer=executor_initializer,
256
313
  initargs=(proc_group,))
314
+ # Initialize the appropriate gauge for the number of free executors
315
+ total_executors = (self.garanteed_parallelism +
316
+ self.burstable_parallelism)
317
+ if metrics_utils.METRICS_ENABLED:
318
+ if self.schedule_type == api_requests.ScheduleType.LONG:
319
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.set(
320
+ total_executors)
321
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
322
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.set(
323
+ total_executors)
257
324
  while not self._cancel_event.is_set():
258
325
  self.process_request(executor, queue)
259
326
  # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
@@ -277,43 +344,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
277
344
 
278
345
  @contextlib.contextmanager
279
346
  def override_request_env_and_config(
280
- request_body: payloads.RequestBody,
281
- request_id: str) -> Generator[None, None, None]:
347
+ request_body: payloads.RequestBody, request_id: str,
348
+ request_name: str) -> Generator[None, None, None]:
282
349
  """Override the environment and SkyPilot config for a request."""
283
350
  original_env = os.environ.copy()
284
- # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
285
- # affecting client requests. If set on the client side, it will be
286
- # overridden by the request body.
287
- os.environ.pop('SKYPILOT_DEBUG', None)
288
- os.environ.update(request_body.env_vars)
289
- # Note: may be overridden by AuthProxyMiddleware.
290
- # TODO(zhwu): we need to make the entire request a context available to the
291
- # entire request execution, so that we can access info like user through
292
- # the execution.
293
- user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
294
- name=request_body.env_vars[constants.USER_ENV_VAR])
295
- global_user_state.add_or_update_user(user)
296
- # Refetch the user to get the latest user info, including the created_at
297
- # field.
298
- user = global_user_state.get_user(user.id)
299
-
300
- # Force color to be enabled.
301
- os.environ['CLICOLOR_FORCE'] = '1'
302
- server_common.reload_for_new_request(
303
- client_entrypoint=request_body.entrypoint,
304
- client_command=request_body.entrypoint_command,
305
- using_remote_api_server=request_body.using_remote_api_server,
306
- user=user,
307
- request_id=request_id)
308
351
  try:
352
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
353
+ # server affecting client requests. If set on the client side, it will
354
+ # be overridden by the request body.
355
+ os.environ.pop('SKYPILOT_DEBUG', None)
356
+ # Remove the db connection uri from client supplied env vars, as the
357
+ # client should not set the db string on server side.
358
+ request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
359
+ os.environ.update(request_body.env_vars)
360
+ # Note: may be overridden by AuthProxyMiddleware.
361
+ # TODO(zhwu): we need to make the entire request a context available to
362
+ # the entire request execution, so that we can access info like user
363
+ # through the execution.
364
+ user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
365
+ name=request_body.env_vars[constants.USER_ENV_VAR])
366
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
367
+
368
+ # Force color to be enabled.
369
+ os.environ['CLICOLOR_FORCE'] = '1'
370
+ server_common.reload_for_new_request(
371
+ client_entrypoint=request_body.entrypoint,
372
+ client_command=request_body.entrypoint_command,
373
+ using_remote_api_server=request_body.using_remote_api_server,
374
+ user=user,
375
+ request_id=request_id)
309
376
  logger.debug(
310
377
  f'override path: {request_body.override_skypilot_config_path}')
311
378
  with skypilot_config.override_skypilot_config(
312
379
  request_body.override_skypilot_config,
313
380
  request_body.override_skypilot_config_path):
314
- # Rejecting requests to workspaces that the user does not have
315
- # permission to access.
316
- workspaces_core.reject_request_for_unauthorized_workspace(user)
381
+ # Skip permission check for sky.workspaces.get request
382
+ # as it is used to determine which workspaces the user
383
+ # has access to.
384
+ if request_name != 'sky.workspaces.get':
385
+ try:
386
+ # Reject requests that the user does not have permission
387
+ # to access.
388
+ workspaces_core.reject_request_for_unauthorized_workspace(
389
+ user)
390
+ except exceptions.PermissionDeniedError as e:
391
+ logger.debug(
392
+ f'{request_id} permission denied to workspace: '
393
+ f'{skypilot_config.get_active_workspace()}: {e}')
394
+ raise e
395
+ logger.debug(
396
+ f'{request_id} permission granted to {request_name} request')
317
397
  yield
318
398
  finally:
319
399
  # We need to call the save_timeline() since atexit will not be
@@ -327,29 +407,6 @@ def override_request_env_and_config(
327
407
  os.environ.update(original_env)
328
408
 
329
409
 
330
- def _redirect_output(file: TextIO) -> Tuple[int, int]:
331
- """Redirect stdout and stderr to the log file."""
332
- fd = file.fileno() # Get the file descriptor from the file object
333
- # Store copies of the original stdout and stderr file descriptors
334
- original_stdout = os.dup(sys.stdout.fileno())
335
- original_stderr = os.dup(sys.stderr.fileno())
336
-
337
- # Copy this fd to stdout and stderr
338
- os.dup2(fd, sys.stdout.fileno())
339
- os.dup2(fd, sys.stderr.fileno())
340
- return original_stdout, original_stderr
341
-
342
-
343
- def _restore_output(original_stdout: int, original_stderr: int) -> None:
344
- """Restore stdout and stderr to their original file descriptors."""
345
- os.dup2(original_stdout, sys.stdout.fileno())
346
- os.dup2(original_stderr, sys.stderr.fileno())
347
-
348
- # Close the duplicate file descriptors
349
- os.close(original_stdout)
350
- os.close(original_stderr)
351
-
352
-
353
410
  def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
354
411
  raise KeyboardInterrupt
355
412
 
@@ -367,76 +424,226 @@ def _request_execution_wrapper(request_id: str,
367
424
  4. Handle the SIGTERM signal to abort the request gracefully.
368
425
  5. Maintain the lifecycle of the temp dir used by the request.
369
426
  """
427
+ pid = multiprocessing.current_process().pid
428
+ proc = psutil.Process(pid)
429
+ rss_begin = proc.memory_info().rss
370
430
  db_utils.set_max_connections(num_db_connections_per_worker)
371
431
  # Handle the SIGTERM signal to abort the request processing gracefully.
372
- signal.signal(signal.SIGTERM, _sigterm_handler)
432
+ # Only set up signal handlers in the main thread, as signal.signal() raises
433
+ # ValueError if called from a non-main thread (e.g., in tests).
434
+ if threading.current_thread() is threading.main_thread():
435
+ signal.signal(signal.SIGTERM, _sigterm_handler)
373
436
 
374
- pid = multiprocessing.current_process().pid
375
437
  logger.info(f'Running request {request_id} with pid {pid}')
376
- with api_requests.update_request(request_id) as request_task:
377
- assert request_task is not None, request_id
378
- log_path = request_task.log_path
379
- request_task.pid = pid
380
- request_task.status = api_requests.RequestStatus.RUNNING
381
- func = request_task.entrypoint
382
- request_body = request_task.request_body
383
- request_name = request_task.name
384
-
385
- # Append to the log file instead of overwriting it since there might be
386
- # logs from previous retries.
387
- with log_path.open('a', encoding='utf-8') as f:
438
+
439
+ original_stdout = original_stderr = None
440
+
441
+ def _save_current_output() -> None:
442
+ """Save the current stdout and stderr file descriptors."""
443
+ nonlocal original_stdout, original_stderr
444
+ original_stdout = os.dup(sys.stdout.fileno())
445
+ original_stderr = os.dup(sys.stderr.fileno())
446
+
447
+ def _redirect_output(file: TextIO) -> None:
448
+ """Redirect stdout and stderr to the log file."""
449
+ # Get the file descriptor from the file object
450
+ fd = file.fileno()
451
+ # Copy this fd to stdout and stderr
452
+ os.dup2(fd, sys.stdout.fileno())
453
+ os.dup2(fd, sys.stderr.fileno())
454
+
455
+ def _restore_output() -> None:
456
+ """Restore stdout and stderr to their original file descriptors."""
457
+ nonlocal original_stdout, original_stderr
458
+ if original_stdout is not None:
459
+ os.dup2(original_stdout, sys.stdout.fileno())
460
+ os.close(original_stdout)
461
+ original_stdout = None
462
+
463
+ if original_stderr is not None:
464
+ os.dup2(original_stderr, sys.stderr.fileno())
465
+ os.close(original_stderr)
466
+ original_stderr = None
467
+
468
+ request_name = None
469
+ try:
470
+ # As soon as the request is updated with the executor PID, we can
471
+ # receive SIGTERM from cancellation. So, we update the request inside
472
+ # the try block to ensure we have the KeyboardInterrupt handling.
473
+ with api_requests.update_request(request_id) as request_task:
474
+ assert request_task is not None, request_id
475
+ if request_task.status != api_requests.RequestStatus.PENDING:
476
+ logger.debug(f'Request is already {request_task.status.value}, '
477
+ f'skipping execution')
478
+ return
479
+ log_path = request_task.log_path
480
+ request_task.pid = pid
481
+ request_task.status = api_requests.RequestStatus.RUNNING
482
+ func = request_task.entrypoint
483
+ request_body = request_task.request_body
484
+ request_name = request_task.name
485
+
388
486
  # Store copies of the original stdout and stderr file descriptors
389
- original_stdout, original_stderr = _redirect_output(f)
390
- # Redirect the stdout/stderr before overriding the environment and
391
- # config, as there can be some logs during override that needs to be
392
- # captured in the log file.
393
- try:
487
+ # We do this in two steps because we should make sure to restore the
488
+ # original values even if we are cancelled or fail during the redirect.
489
+ _save_current_output()
490
+
491
+ # Append to the log file instead of overwriting it since there might be
492
+ # logs from previous retries.
493
+ with log_path.open('a', encoding='utf-8') as f:
494
+ # Redirect the stdout/stderr before overriding the environment and
495
+ # config, as there can be some logs during override that need to be
496
+ # captured in the log file.
497
+ _redirect_output(f)
498
+
394
499
  with sky_logging.add_debug_log_handler(request_id), \
395
- override_request_env_and_config(request_body, request_id), \
500
+ override_request_env_and_config(
501
+ request_body, request_id, request_name), \
396
502
  tempstore.tempdir():
397
503
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
398
504
  config = skypilot_config.to_dict()
399
505
  logger.debug(f'request config: \n'
400
506
  f'{yaml_utils.dump_yaml_str(dict(config))}')
401
- with metrics_lib.time_it(name=request_name,
402
- group='request_execution'):
507
+ (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
508
+ labels(request=request_name, pid=pid).inc())
509
+ with metrics_utils.time_it(name=request_name,
510
+ group='request_execution'):
403
511
  return_value = func(**request_body.to_kwargs())
404
512
  f.flush()
405
- except KeyboardInterrupt:
406
- logger.info(f'Request {request_id} cancelled by user')
407
- # Kill all children processes related to this request.
408
- # Each executor handles a single request, so we can safely kill all
409
- # children processes related to this request.
410
- # This is required as python does not pass the KeyboardInterrupt
411
- # to the threads that are not main thread.
412
- subprocess_utils.kill_children_processes()
413
- _restore_output(original_stdout, original_stderr)
414
- return
415
- except exceptions.ExecutionRetryableError as e:
416
- logger.error(e)
417
- logger.info(e.hint)
418
- with api_requests.update_request(request_id) as request_task:
419
- assert request_task is not None, request_id
420
- # Retried request will undergo rescheduling and a new execution,
421
- # clear the pid of the request.
422
- request_task.pid = None
423
- # Yield control to the scheduler for uniform handling of retries.
424
- _restore_output(original_stdout, original_stderr)
425
- raise
426
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
427
- api_requests.set_request_failed(request_id, e)
428
- _restore_output(original_stdout, original_stderr)
429
- logger.info(f'Request {request_id} failed due to '
430
- f'{common_utils.format_exception(e)}')
431
- return
432
- else:
433
- api_requests.set_request_succeeded(
434
- request_id, return_value if not ignore_return_value else None)
435
- _restore_output(original_stdout, original_stderr)
436
- logger.info(f'Request {request_id} finished')
513
+ except KeyboardInterrupt:
514
+ logger.info(f'Request {request_id} cancelled by user')
515
+ # Kill all children processes related to this request.
516
+ # Each executor handles a single request, so we can safely kill all
517
+ # children processes related to this request.
518
+ # This is required as python does not pass the KeyboardInterrupt to the
519
+ # threads that are not main thread.
520
+ subprocess_utils.kill_children_processes()
521
+ return
522
+ except exceptions.ExecutionRetryableError as e:
523
+ logger.error(e)
524
+ logger.info(e.hint)
525
+ with api_requests.update_request(request_id) as request_task:
526
+ assert request_task is not None, request_id
527
+ # Retried request will undergo rescheduling and a new execution,
528
+ # clear the pid of the request.
529
+ request_task.pid = None
530
+ # Yield control to the scheduler for uniform handling of retries.
531
+ _restore_output()
532
+ raise
533
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
534
+ api_requests.set_request_failed(request_id, e)
535
+ # Manually reset the original stdout and stderr file descriptors early
536
+ # so that the "Request xxxx failed due to ..." log message will be
537
+ # written to the original stdout and stderr file descriptors.
538
+ _restore_output()
539
+ logger.error(f'Request {request_id} failed due to '
540
+ f'{common_utils.format_exception(e)}')
541
+ return
542
+ else:
543
+ api_requests.set_request_succeeded(
544
+ request_id, return_value if not ignore_return_value else None)
545
+ # Manually reset the original stdout and stderr file descriptors early
546
+ # so that the "Request xxxx finished" log message will be
547
+ # written to the original stdout and stderr file descriptors.
548
+ _restore_output()
549
+ logger.info(f'Request {request_id} finished')
550
+ finally:
551
+ _restore_output()
552
+ try:
553
+ # Capture the peak RSS before GC.
554
+ peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
555
+ # Clear request level cache to release all memory used by the
556
+ # request.
557
+ annotations.clear_request_level_cache()
558
+ with metrics_utils.time_it(name='release_memory', group='internal'):
559
+ common_utils.release_memory()
560
+ if request_name is not None:
561
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
562
+ except Exception as e: # pylint: disable=broad-except
563
+ logger.error(f'Failed to record memory metrics: '
564
+ f'{common_utils.format_exception(e)}')
565
+
566
+
567
+ _first_request = True
568
+
569
+
570
+ def _record_memory_metrics(request_name: str, proc: psutil.Process,
571
+ rss_begin: int, peak_rss: int) -> None:
572
+ """Record the memory metrics for a request."""
573
+ # Do not record full memory delta for the first request as it
574
+ # will load the sky core modules and make the memory usage
575
+ # estimation inaccurate.
576
+ global _first_request
577
+ if _first_request:
578
+ _first_request = False
579
+ return
580
+ rss_end = proc.memory_info().rss
581
+
582
+ # Answer "how much RSS this request contributed?"
583
+ metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
584
+ name=request_name).observe(max(rss_end - rss_begin, 0))
585
+ # Estimate the memory usage by the request by capturing the
586
+ # peak memory delta during the request execution.
587
+ metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
588
+ name=request_name).observe(max(peak_rss - rss_begin, 0))
589
+
590
+
591
+ class CoroutineTask:
592
+ """Wrapper of a background task that runs in a coroutine."""
593
+
594
+ def __init__(self, task: asyncio.Task):
595
+ self.task = task
596
+
597
+ async def cancel(self):
598
+ try:
599
+ self.task.cancel()
600
+ await self.task
601
+ except asyncio.CancelledError:
602
+ pass
603
+
604
+
605
+ def check_request_thread_executor_available() -> None:
606
+ """Check if the request thread executor is available.
437
607
 
608
+ This is a best effort check to hint the client to retry other server
609
+ processes when there is no available thread worker in current one. But
610
+ a request may pass this check and still cannot get worker on execution
611
+ time due to race condition. In this case, the client will see a failed
612
+ request instead of retry.
438
613
 
439
- async def execute_request_coroutine(request: api_requests.Request):
614
+ TODO(aylei): this can be refined with a refactor of our coroutine
615
+ execution flow.
616
+ """
617
+ get_request_thread_executor().check_available()
618
+
619
+
620
+ def execute_request_in_coroutine(
621
+ request: api_requests.Request) -> CoroutineTask:
622
+ """Execute a request in current event loop.
623
+
624
+ Args:
625
+ request: The request to execute.
626
+
627
+ Returns:
628
+ A CoroutineTask handle to operate the background task.
629
+ """
630
+ task = asyncio.create_task(_execute_request_coroutine(request))
631
+ return CoroutineTask(task)
632
+
633
+
634
+ def _execute_with_config_override(func: Callable,
635
+ request_body: payloads.RequestBody,
636
+ request_id: str, request_name: str,
637
+ **kwargs) -> Any:
638
+ """Execute a function with env and config override inside a thread."""
639
+ # Override the environment and config within this thread's context,
640
+ # which gets copied when we call to_thread.
641
+ with override_request_env_and_config(request_body, request_id,
642
+ request_name):
643
+ return func(**kwargs)
644
+
645
+
646
+ async def _execute_request_coroutine(request: api_requests.Request):
440
647
  """Execute a request in current event loop.
441
648
 
442
649
  Similar to _request_execution_wrapper, but executed as coroutine in current
@@ -449,39 +656,43 @@ async def execute_request_coroutine(request: api_requests.Request):
449
656
  logger.info(f'Executing request {request.request_id} in coroutine')
450
657
  func = request.entrypoint
451
658
  request_body = request.request_body
452
- with api_requests.update_request(request.request_id) as request_task:
453
- request_task.status = api_requests.RequestStatus.RUNNING
659
+ await api_requests.update_status_async(request.request_id,
660
+ api_requests.RequestStatus.RUNNING)
454
661
  # Redirect stdout and stderr to the request log path.
455
662
  original_output = ctx.redirect_log(request.log_path)
456
- # Override environment variables that backs env_options.Options
457
- # TODO(aylei): compared to process executor, running task in coroutine has
458
- # two issues to fix:
459
- # 1. skypilot config is not contextual
460
- # 2. envs that read directly from os.environ are not contextual
461
- ctx.override_envs(request_body.env_vars)
462
- fut: asyncio.Future = context_utils.to_thread(func,
463
- **request_body.to_kwargs())
663
+ try:
664
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
665
+ get_request_thread_executor(), _execute_with_config_override, func,
666
+ request_body, request.request_id, request.name,
667
+ **request_body.to_kwargs())
668
+ except Exception as e: # pylint: disable=broad-except
669
+ ctx.redirect_log(original_output)
670
+ await api_requests.set_request_failed_async(request.request_id, e)
671
+ logger.error(f'Failed to run request {request.request_id} due to '
672
+ f'{common_utils.format_exception(e)}')
673
+ return
464
674
 
465
675
  async def poll_task(request_id: str) -> bool:
466
- request = await api_requests.get_request_async(request_id)
467
- if request is None:
676
+ req_status = await api_requests.get_request_status_async(request_id)
677
+ if req_status is None:
468
678
  raise RuntimeError('Request not found')
469
679
 
470
- if request.status == api_requests.RequestStatus.CANCELLED:
680
+ if req_status.status == api_requests.RequestStatus.CANCELLED:
471
681
  ctx.cancel()
472
682
  return True
473
683
 
474
684
  if fut.done():
475
685
  try:
476
686
  result = await fut
477
- api_requests.set_request_succeeded(request_id, result)
687
+ await api_requests.set_request_succeeded_async(
688
+ request_id, result)
478
689
  except asyncio.CancelledError:
479
690
  # The task is cancelled by ctx.cancel(), where the status
480
691
  # should already be set to CANCELLED.
481
692
  pass
482
693
  except Exception as e: # pylint: disable=broad-except
483
694
  ctx.redirect_log(original_output)
484
- api_requests.set_request_failed(request_id, e)
695
+ await api_requests.set_request_failed_async(request_id, e)
485
696
  logger.error(f'Request {request_id} failed due to '
486
697
  f'{common_utils.format_exception(e)}')
487
698
  return True
@@ -496,22 +707,25 @@ async def execute_request_coroutine(request: api_requests.Request):
496
707
  except asyncio.CancelledError:
497
708
  # Current coroutine is cancelled due to client disconnect, set the
498
709
  # request status for consistency.
499
- api_requests.set_request_cancelled(request.request_id)
710
+ await api_requests.set_request_cancelled_async(request.request_id)
500
711
  pass
501
712
  # pylint: disable=broad-except
502
713
  except (Exception, KeyboardInterrupt, SystemExit) as e:
503
714
  # Handle any other error
504
715
  ctx.redirect_log(original_output)
505
- ctx.cancel()
506
- api_requests.set_request_failed(request.request_id, e)
716
+ await api_requests.set_request_failed_async(request.request_id, e)
507
717
  logger.error(f'Request {request.request_id} interrupted due to '
508
718
  f'unhandled exception: {common_utils.format_exception(e)}')
509
719
  raise
720
+ finally:
721
+ # Always cancel the context to kill potentially running background
722
+ # routine.
723
+ ctx.cancel()
510
724
 
511
725
 
512
- def prepare_request(
726
+ async def prepare_request_async(
513
727
  request_id: str,
514
- request_name: str,
728
+ request_name: request_names.RequestName,
515
729
  request_body: payloads.RequestBody,
516
730
  func: Callable[P, Any],
517
731
  request_cluster_name: Optional[str] = None,
@@ -535,7 +749,7 @@ def prepare_request(
535
749
  user_id=user_id,
536
750
  cluster_name=request_cluster_name)
537
751
 
538
- if not api_requests.create_if_not_exists(request):
752
+ if not await api_requests.create_if_not_exists_async(request):
539
753
  raise exceptions.RequestAlreadyExistsError(
540
754
  f'Request {request_id} already exists.')
541
755
 
@@ -543,17 +757,18 @@ def prepare_request(
543
757
  return request
544
758
 
545
759
 
546
- def schedule_request(request_id: str,
547
- request_name: str,
548
- request_body: payloads.RequestBody,
549
- func: Callable[P, Any],
550
- request_cluster_name: Optional[str] = None,
551
- ignore_return_value: bool = False,
552
- schedule_type: api_requests.ScheduleType = (
553
- api_requests.ScheduleType.LONG),
554
- is_skypilot_system: bool = False,
555
- precondition: Optional[preconditions.Precondition] = None,
556
- retryable: bool = False) -> None:
760
+ async def schedule_request_async(request_id: str,
761
+ request_name: request_names.RequestName,
762
+ request_body: payloads.RequestBody,
763
+ func: Callable[P, Any],
764
+ request_cluster_name: Optional[str] = None,
765
+ ignore_return_value: bool = False,
766
+ schedule_type: api_requests.ScheduleType = (
767
+ api_requests.ScheduleType.LONG),
768
+ is_skypilot_system: bool = False,
769
+ precondition: Optional[
770
+ preconditions.Precondition] = None,
771
+ retryable: bool = False) -> None:
557
772
  """Enqueue a request to the request queue.
558
773
 
559
774
  Args:
@@ -574,13 +789,37 @@ def schedule_request(request_id: str,
574
789
  The precondition is waited asynchronously and does not block the
575
790
  caller.
576
791
  """
577
- prepare_request(request_id, request_name, request_body, func,
578
- request_cluster_name, schedule_type, is_skypilot_system)
792
+ request_task = await prepare_request_async(request_id, request_name,
793
+ request_body, func,
794
+ request_cluster_name,
795
+ schedule_type,
796
+ is_skypilot_system)
797
+ schedule_prepared_request(request_task, ignore_return_value, precondition,
798
+ retryable)
799
+
800
+
801
+ def schedule_prepared_request(request_task: api_requests.Request,
802
+ ignore_return_value: bool = False,
803
+ precondition: Optional[
804
+ preconditions.Precondition] = None,
805
+ retryable: bool = False) -> None:
806
+ """Enqueue a request to the request queue
807
+
808
+ Args:
809
+ request_task: The prepared request task to schedule.
810
+ ignore_return_value: If True, the return value of the function will be
811
+ ignored.
812
+ precondition: If a precondition is provided, the request will only be
813
+ scheduled for execution when the precondition is met (returns True).
814
+ The precondition is waited asynchronously and does not block the
815
+ caller.
816
+ retryable: Whether the request should be retried if it fails.
817
+ """
579
818
 
580
819
  def enqueue():
581
- input_tuple = (request_id, ignore_return_value, retryable)
582
- logger.info(f'Queuing request: {request_id}')
583
- _get_queue(schedule_type).put(input_tuple)
820
+ input_tuple = (request_task.request_id, ignore_return_value, retryable)
821
+ logger.info(f'Queuing request: {request_task.request_id}')
822
+ _get_queue(request_task.schedule_type).put(input_tuple)
584
823
 
585
824
  if precondition is not None:
586
825
  # Wait async to avoid blocking caller.