skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -31,6 +31,7 @@ import time
31
31
  import typing
32
32
  from typing import Any, Callable, Generator, List, Optional, TextIO, Tuple
33
33
 
34
+ import psutil
34
35
  import setproctitle
35
36
 
36
37
  from sky import exceptions
@@ -38,6 +39,7 @@ from sky import global_user_state
38
39
  from sky import models
39
40
  from sky import sky_logging
40
41
  from sky import skypilot_config
42
+ from sky.metrics import utils as metrics_utils
41
43
  from sky.server import common as server_common
42
44
  from sky.server import config as server_config
43
45
  from sky.server import constants as server_constants
@@ -45,7 +47,9 @@ from sky.server import metrics as metrics_lib
45
47
  from sky.server.requests import payloads
46
48
  from sky.server.requests import preconditions
47
49
  from sky.server.requests import process
50
+ from sky.server.requests import request_names
48
51
  from sky.server.requests import requests as api_requests
52
+ from sky.server.requests import threads
49
53
  from sky.server.requests.queues import local_queue
50
54
  from sky.server.requests.queues import mp_queue
51
55
  from sky.skylet import constants
@@ -79,6 +83,31 @@ logger = sky_logging.init_logger(__name__)
79
83
  # platforms, including macOS.
80
84
  multiprocessing.set_start_method('spawn', force=True)
81
85
 
86
+ # An upper limit on the number of request-execution threads per server process.
87
+ # It is unlikely to be reached, which allows higher concurrency while still
88
+ # preventing the server process from becoming overloaded.
89
+ _REQUEST_THREADS_LIMIT = 128
90
+
91
+ _REQUEST_THREAD_EXECUTOR_LOCK = threading.Lock()
92
+ # A dedicated thread pool executor for running synchronous requests from
93
+ # coroutines, to avoid:
94
+ # 1. blocking the event loop;
95
+ # 2. exhausting the event loop's default thread pool executor.
96
+ _REQUEST_THREAD_EXECUTOR: Optional[threads.OnDemandThreadExecutor] = None
97
+
98
+
99
+ def get_request_thread_executor() -> threads.OnDemandThreadExecutor:
100
+ """Lazy init and return the request thread executor for current process."""
101
+ global _REQUEST_THREAD_EXECUTOR
102
+ if _REQUEST_THREAD_EXECUTOR is not None:
103
+ return _REQUEST_THREAD_EXECUTOR
104
+ with _REQUEST_THREAD_EXECUTOR_LOCK:
105
+ if _REQUEST_THREAD_EXECUTOR is None:
106
+ _REQUEST_THREAD_EXECUTOR = threads.OnDemandThreadExecutor(
107
+ name='request_thread_executor',
108
+ max_workers=_REQUEST_THREADS_LIMIT)
109
+ return _REQUEST_THREAD_EXECUTOR
110
+
82
111
 
83
112
  class RequestQueue:
84
113
  """The queue for the requests, either redis or multiprocessing.
@@ -130,6 +159,10 @@ queue_backend = server_config.QueueBackend.MULTIPROCESSING
130
159
  def executor_initializer(proc_group: str):
131
160
  setproctitle.setproctitle(f'SkyPilot:executor:{proc_group}:'
132
161
  f'{multiprocessing.current_process().pid}')
162
+ # Executor never stops, unless the whole process is killed.
163
+ threading.Thread(target=metrics_lib.process_monitor,
164
+ args=(f'worker:{proc_group}', threading.Event()),
165
+ daemon=True).start()
133
166
 
134
167
 
135
168
  class RequestWorker:
@@ -182,10 +215,11 @@ class RequestWorker:
182
215
  time.sleep(0.1)
183
216
  return
184
217
  request_id, ignore_return_value, _ = request_element
185
- request = api_requests.get_request(request_id)
218
+ request = api_requests.get_request(request_id, fields=['status'])
186
219
  assert request is not None, f'Request with ID {request_id} is None'
187
220
  if request.status == api_requests.RequestStatus.CANCELLED:
188
221
  return
222
+ del request
189
223
  logger.info(f'[{self}] Submitting request: {request_id}')
190
224
  # Start additional process to run the request, so that it can be
191
225
  # cancelled when requested by a user.
@@ -196,6 +230,12 @@ class RequestWorker:
196
230
  fut = executor.submit_until_success(
197
231
  _request_execution_wrapper, request_id, ignore_return_value,
198
232
  self.num_db_connections_per_worker)
233
+ # Decrement the free executor count when a request starts
234
+ if metrics_utils.METRICS_ENABLED:
235
+ if self.schedule_type == api_requests.ScheduleType.LONG:
236
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.dec()
237
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
238
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.dec()
199
239
  # Monitor the result of the request execution.
200
240
  threading.Thread(target=self.handle_task_result,
201
241
  args=(fut, request_element),
@@ -230,9 +270,23 @@ class RequestWorker:
230
270
  queue.put(request_element)
231
271
  except exceptions.ExecutionRetryableError as e:
232
272
  time.sleep(e.retry_wait_seconds)
273
+ # Reset the request status to PENDING so it can be picked up again.
274
+ # Assume retryable since the error is ExecutionRetryableError.
275
+ request_id, _, _ = request_element
276
+ with api_requests.update_request(request_id) as request_task:
277
+ assert request_task is not None, request_id
278
+ request_task.status = api_requests.RequestStatus.PENDING
233
279
  # Reschedule the request.
234
280
  queue = _get_queue(self.schedule_type)
235
281
  queue.put(request_element)
282
+ logger.info(f'Rescheduled request {request_id} for retry')
283
+ finally:
284
+ # Increment the free executor count when a request finishes
285
+ if metrics_utils.METRICS_ENABLED:
286
+ if self.schedule_type == api_requests.ScheduleType.LONG:
287
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.inc()
288
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
289
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.inc()
236
290
 
237
291
  def run(self) -> None:
238
292
  # Handle the SIGTERM signal to abort the executor process gracefully.
@@ -254,6 +308,16 @@ class RequestWorker:
254
308
  burst_workers=self.burstable_parallelism,
255
309
  initializer=executor_initializer,
256
310
  initargs=(proc_group,))
311
+ # Initialize the appropriate gauge for the number of free executors
312
+ total_executors = (self.garanteed_parallelism +
313
+ self.burstable_parallelism)
314
+ if metrics_utils.METRICS_ENABLED:
315
+ if self.schedule_type == api_requests.ScheduleType.LONG:
316
+ metrics_utils.SKY_APISERVER_LONG_EXECUTORS.set(
317
+ total_executors)
318
+ elif self.schedule_type == api_requests.ScheduleType.SHORT:
319
+ metrics_utils.SKY_APISERVER_SHORT_EXECUTORS.set(
320
+ total_executors)
257
321
  while not self._cancel_event.is_set():
258
322
  self.process_request(executor, queue)
259
323
  # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
@@ -277,43 +341,56 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
277
341
 
278
342
  @contextlib.contextmanager
279
343
  def override_request_env_and_config(
280
- request_body: payloads.RequestBody,
281
- request_id: str) -> Generator[None, None, None]:
344
+ request_body: payloads.RequestBody, request_id: str,
345
+ request_name: str) -> Generator[None, None, None]:
282
346
  """Override the environment and SkyPilot config for a request."""
283
347
  original_env = os.environ.copy()
284
- # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API server
285
- # affecting client requests. If set on the client side, it will be
286
- # overridden by the request body.
287
- os.environ.pop('SKYPILOT_DEBUG', None)
288
- os.environ.update(request_body.env_vars)
289
- # Note: may be overridden by AuthProxyMiddleware.
290
- # TODO(zhwu): we need to make the entire request a context available to the
291
- # entire request execution, so that we can access info like user through
292
- # the execution.
293
- user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
294
- name=request_body.env_vars[constants.USER_ENV_VAR])
295
- global_user_state.add_or_update_user(user)
296
- # Refetch the user to get the latest user info, including the created_at
297
- # field.
298
- user = global_user_state.get_user(user.id)
299
-
300
- # Force color to be enabled.
301
- os.environ['CLICOLOR_FORCE'] = '1'
302
- server_common.reload_for_new_request(
303
- client_entrypoint=request_body.entrypoint,
304
- client_command=request_body.entrypoint_command,
305
- using_remote_api_server=request_body.using_remote_api_server,
306
- user=user,
307
- request_id=request_id)
308
348
  try:
349
+ # Unset SKYPILOT_DEBUG by default, to avoid the value set on the API
350
+ # server affecting client requests. If set on the client side, it will
351
+ # be overridden by the request body.
352
+ os.environ.pop('SKYPILOT_DEBUG', None)
353
+ # Remove the db connection uri from client supplied env vars, as the
354
+ # client should not set the db string on server side.
355
+ request_body.env_vars.pop(constants.ENV_VAR_DB_CONNECTION_URI, None)
356
+ os.environ.update(request_body.env_vars)
357
+ # Note: may be overridden by AuthProxyMiddleware.
358
+ # TODO(zhwu): we need to make the entire request a context available to
359
+ # the entire request execution, so that we can access info like user
360
+ # through the execution.
361
+ user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
362
+ name=request_body.env_vars[constants.USER_ENV_VAR])
363
+ _, user = global_user_state.add_or_update_user(user, return_user=True)
364
+
365
+ # Force color to be enabled.
366
+ os.environ['CLICOLOR_FORCE'] = '1'
367
+ server_common.reload_for_new_request(
368
+ client_entrypoint=request_body.entrypoint,
369
+ client_command=request_body.entrypoint_command,
370
+ using_remote_api_server=request_body.using_remote_api_server,
371
+ user=user,
372
+ request_id=request_id)
309
373
  logger.debug(
310
374
  f'override path: {request_body.override_skypilot_config_path}')
311
375
  with skypilot_config.override_skypilot_config(
312
376
  request_body.override_skypilot_config,
313
377
  request_body.override_skypilot_config_path):
314
- # Rejecting requests to workspaces that the user does not have
315
- # permission to access.
316
- workspaces_core.reject_request_for_unauthorized_workspace(user)
378
+ # Skip permission check for sky.workspaces.get request
379
+ # as it is used to determine which workspaces the user
380
+ # has access to.
381
+ if request_name != 'sky.workspaces.get':
382
+ try:
383
+ # Reject requests that the user does not have permission
384
+ # to access.
385
+ workspaces_core.reject_request_for_unauthorized_workspace(
386
+ user)
387
+ except exceptions.PermissionDeniedError as e:
388
+ logger.debug(
389
+ f'{request_id} permission denied to workspace: '
390
+ f'{skypilot_config.get_active_workspace()}: {e}')
391
+ raise e
392
+ logger.debug(
393
+ f'{request_id} permission granted to {request_name} request')
317
394
  yield
318
395
  finally:
319
396
  # We need to call the save_timeline() since atexit will not be
@@ -327,29 +404,6 @@ def override_request_env_and_config(
327
404
  os.environ.update(original_env)
328
405
 
329
406
 
330
- def _redirect_output(file: TextIO) -> Tuple[int, int]:
331
- """Redirect stdout and stderr to the log file."""
332
- fd = file.fileno() # Get the file descriptor from the file object
333
- # Store copies of the original stdout and stderr file descriptors
334
- original_stdout = os.dup(sys.stdout.fileno())
335
- original_stderr = os.dup(sys.stderr.fileno())
336
-
337
- # Copy this fd to stdout and stderr
338
- os.dup2(fd, sys.stdout.fileno())
339
- os.dup2(fd, sys.stderr.fileno())
340
- return original_stdout, original_stderr
341
-
342
-
343
- def _restore_output(original_stdout: int, original_stderr: int) -> None:
344
- """Restore stdout and stderr to their original file descriptors."""
345
- os.dup2(original_stdout, sys.stdout.fileno())
346
- os.dup2(original_stderr, sys.stderr.fileno())
347
-
348
- # Close the duplicate file descriptors
349
- os.close(original_stdout)
350
- os.close(original_stderr)
351
-
352
-
353
407
  def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
354
408
  raise KeyboardInterrupt
355
409
 
@@ -367,76 +421,226 @@ def _request_execution_wrapper(request_id: str,
367
421
  4. Handle the SIGTERM signal to abort the request gracefully.
368
422
  5. Maintain the lifecycle of the temp dir used by the request.
369
423
  """
424
+ pid = multiprocessing.current_process().pid
425
+ proc = psutil.Process(pid)
426
+ rss_begin = proc.memory_info().rss
370
427
  db_utils.set_max_connections(num_db_connections_per_worker)
371
428
  # Handle the SIGTERM signal to abort the request processing gracefully.
372
- signal.signal(signal.SIGTERM, _sigterm_handler)
429
+ # Only set up signal handlers in the main thread, as signal.signal() raises
430
+ # ValueError if called from a non-main thread (e.g., in tests).
431
+ if threading.current_thread() is threading.main_thread():
432
+ signal.signal(signal.SIGTERM, _sigterm_handler)
373
433
 
374
- pid = multiprocessing.current_process().pid
375
434
  logger.info(f'Running request {request_id} with pid {pid}')
376
- with api_requests.update_request(request_id) as request_task:
377
- assert request_task is not None, request_id
378
- log_path = request_task.log_path
379
- request_task.pid = pid
380
- request_task.status = api_requests.RequestStatus.RUNNING
381
- func = request_task.entrypoint
382
- request_body = request_task.request_body
383
- request_name = request_task.name
384
-
385
- # Append to the log file instead of overwriting it since there might be
386
- # logs from previous retries.
387
- with log_path.open('a', encoding='utf-8') as f:
435
+
436
+ original_stdout = original_stderr = None
437
+
438
+ def _save_current_output() -> None:
439
+ """Save the current stdout and stderr file descriptors."""
440
+ nonlocal original_stdout, original_stderr
441
+ original_stdout = os.dup(sys.stdout.fileno())
442
+ original_stderr = os.dup(sys.stderr.fileno())
443
+
444
+ def _redirect_output(file: TextIO) -> None:
445
+ """Redirect stdout and stderr to the log file."""
446
+ # Get the file descriptor from the file object
447
+ fd = file.fileno()
448
+ # Copy this fd to stdout and stderr
449
+ os.dup2(fd, sys.stdout.fileno())
450
+ os.dup2(fd, sys.stderr.fileno())
451
+
452
+ def _restore_output() -> None:
453
+ """Restore stdout and stderr to their original file descriptors."""
454
+ nonlocal original_stdout, original_stderr
455
+ if original_stdout is not None:
456
+ os.dup2(original_stdout, sys.stdout.fileno())
457
+ os.close(original_stdout)
458
+ original_stdout = None
459
+
460
+ if original_stderr is not None:
461
+ os.dup2(original_stderr, sys.stderr.fileno())
462
+ os.close(original_stderr)
463
+ original_stderr = None
464
+
465
+ request_name = None
466
+ try:
467
+ # As soon as the request is updated with the executor PID, we can
468
+ # receive SIGTERM from cancellation. So, we update the request inside
469
+ # the try block to ensure we have the KeyboardInterrupt handling.
470
+ with api_requests.update_request(request_id) as request_task:
471
+ assert request_task is not None, request_id
472
+ if request_task.status != api_requests.RequestStatus.PENDING:
473
+ logger.debug(f'Request is already {request_task.status.value}, '
474
+ f'skipping execution')
475
+ return
476
+ log_path = request_task.log_path
477
+ request_task.pid = pid
478
+ request_task.status = api_requests.RequestStatus.RUNNING
479
+ func = request_task.entrypoint
480
+ request_body = request_task.request_body
481
+ request_name = request_task.name
482
+
388
483
  # Store copies of the original stdout and stderr file descriptors
389
- original_stdout, original_stderr = _redirect_output(f)
390
- # Redirect the stdout/stderr before overriding the environment and
391
- # config, as there can be some logs during override that needs to be
392
- # captured in the log file.
393
- try:
484
+ # We do this in two steps because we should make sure to restore the
485
+ # original values even if we are cancelled or fail during the redirect.
486
+ _save_current_output()
487
+
488
+ # Append to the log file instead of overwriting it since there might be
489
+ # logs from previous retries.
490
+ with log_path.open('a', encoding='utf-8') as f:
491
+ # Redirect the stdout/stderr before overriding the environment and
492
+ # config, as there can be some logs during override that need to be
493
+ # captured in the log file.
494
+ _redirect_output(f)
495
+
394
496
  with sky_logging.add_debug_log_handler(request_id), \
395
- override_request_env_and_config(request_body, request_id), \
497
+ override_request_env_and_config(
498
+ request_body, request_id, request_name), \
396
499
  tempstore.tempdir():
397
500
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
398
501
  config = skypilot_config.to_dict()
399
502
  logger.debug(f'request config: \n'
400
503
  f'{yaml_utils.dump_yaml_str(dict(config))}')
401
- with metrics_lib.time_it(name=request_name,
402
- group='request_execution'):
504
+ (metrics_utils.SKY_APISERVER_PROCESS_EXECUTION_START_TOTAL.
505
+ labels(request=request_name, pid=pid).inc())
506
+ with metrics_utils.time_it(name=request_name,
507
+ group='request_execution'):
403
508
  return_value = func(**request_body.to_kwargs())
404
509
  f.flush()
405
- except KeyboardInterrupt:
406
- logger.info(f'Request {request_id} cancelled by user')
407
- # Kill all children processes related to this request.
408
- # Each executor handles a single request, so we can safely kill all
409
- # children processes related to this request.
410
- # This is required as python does not pass the KeyboardInterrupt
411
- # to the threads that are not main thread.
412
- subprocess_utils.kill_children_processes()
413
- _restore_output(original_stdout, original_stderr)
414
- return
415
- except exceptions.ExecutionRetryableError as e:
416
- logger.error(e)
417
- logger.info(e.hint)
418
- with api_requests.update_request(request_id) as request_task:
419
- assert request_task is not None, request_id
420
- # Retried request will undergo rescheduling and a new execution,
421
- # clear the pid of the request.
422
- request_task.pid = None
423
- # Yield control to the scheduler for uniform handling of retries.
424
- _restore_output(original_stdout, original_stderr)
425
- raise
426
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
427
- api_requests.set_request_failed(request_id, e)
428
- _restore_output(original_stdout, original_stderr)
429
- logger.info(f'Request {request_id} failed due to '
430
- f'{common_utils.format_exception(e)}')
431
- return
432
- else:
433
- api_requests.set_request_succeeded(
434
- request_id, return_value if not ignore_return_value else None)
435
- _restore_output(original_stdout, original_stderr)
436
- logger.info(f'Request {request_id} finished')
510
+ except KeyboardInterrupt:
511
+ logger.info(f'Request {request_id} cancelled by user')
512
+ # Kill all children processes related to this request.
513
+ # Each executor handles a single request, so we can safely kill all
514
+ # children processes related to this request.
515
+ # This is required as python does not pass the KeyboardInterrupt to the
516
+ # threads that are not main thread.
517
+ subprocess_utils.kill_children_processes()
518
+ return
519
+ except exceptions.ExecutionRetryableError as e:
520
+ logger.error(e)
521
+ logger.info(e.hint)
522
+ with api_requests.update_request(request_id) as request_task:
523
+ assert request_task is not None, request_id
524
+ # Retried request will undergo rescheduling and a new execution,
525
+ # clear the pid of the request.
526
+ request_task.pid = None
527
+ # Yield control to the scheduler for uniform handling of retries.
528
+ _restore_output()
529
+ raise
530
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
531
+ api_requests.set_request_failed(request_id, e)
532
+ # Manually reset the original stdout and stderr file descriptors early
533
+ # so that the "Request xxxx failed due to ..." log message will be
534
+ # written to the original stdout and stderr file descriptors.
535
+ _restore_output()
536
+ logger.info(f'Request {request_id} failed due to '
537
+ f'{common_utils.format_exception(e)}')
538
+ return
539
+ else:
540
+ api_requests.set_request_succeeded(
541
+ request_id, return_value if not ignore_return_value else None)
542
+ # Manually reset the original stdout and stderr file descriptors early
543
+ # so that the "Request xxxx finished" log message will be
544
+ # written to the original stdout and stderr file descriptors.
545
+ _restore_output()
546
+ logger.info(f'Request {request_id} finished')
547
+ finally:
548
+ _restore_output()
549
+ try:
550
+ # Capture the peak RSS before GC.
551
+ peak_rss = max(proc.memory_info().rss, metrics_lib.peak_rss_bytes)
552
+ # Clear request level cache to release all memory used by the
553
+ # request.
554
+ annotations.clear_request_level_cache()
555
+ with metrics_utils.time_it(name='release_memory', group='internal'):
556
+ common_utils.release_memory()
557
+ if request_name is not None:
558
+ _record_memory_metrics(request_name, proc, rss_begin, peak_rss)
559
+ except Exception as e: # pylint: disable=broad-except
560
+ logger.error(f'Failed to record memory metrics: '
561
+ f'{common_utils.format_exception(e)}')
562
+
563
+
564
+ _first_request = True
565
+
566
+
567
+ def _record_memory_metrics(request_name: str, proc: psutil.Process,
568
+ rss_begin: int, peak_rss: int) -> None:
569
+ """Record the memory metrics for a request."""
570
+ # Do not record full memory delta for the first request as it
571
+ # will load the sky core modules and make the memory usage
572
+ # estimation inaccurate.
573
+ global _first_request
574
+ if _first_request:
575
+ _first_request = False
576
+ return
577
+ rss_end = proc.memory_info().rss
578
+
579
+ # Answer "how much RSS this request contributed?"
580
+ metrics_utils.SKY_APISERVER_REQUEST_RSS_INCR_BYTES.labels(
581
+ name=request_name).observe(max(rss_end - rss_begin, 0))
582
+ # Estimate the memory usage by the request by capturing the
583
+ # peak memory delta during the request execution.
584
+ metrics_utils.SKY_APISERVER_REQUEST_MEMORY_USAGE_BYTES.labels(
585
+ name=request_name).observe(max(peak_rss - rss_begin, 0))
586
+
587
+
588
+ class CoroutineTask:
589
+ """Wrapper of a background task that runs in a coroutine."""
590
+
591
+ def __init__(self, task: asyncio.Task):
592
+ self.task = task
593
+
594
+ async def cancel(self):
595
+ try:
596
+ self.task.cancel()
597
+ await self.task
598
+ except asyncio.CancelledError:
599
+ pass
600
+
601
+
602
+ def check_request_thread_executor_available() -> None:
603
+ """Check if the request thread executor is available.
437
604
 
605
+ This is a best effort check to hint the client to retry other server
606
+ processes when there is no available thread worker in current one. But
607
+ a request may pass this check and still cannot get worker on execution
608
+ time due to race condition. In this case, the client will see a failed
609
+ request instead of retry.
438
610
 
439
- async def execute_request_coroutine(request: api_requests.Request):
611
+ TODO(aylei): this can be refined with a refactor of our coroutine
612
+ execution flow.
613
+ """
614
+ get_request_thread_executor().check_available()
615
+
616
+
617
+ def execute_request_in_coroutine(
618
+ request: api_requests.Request) -> CoroutineTask:
619
+ """Execute a request in current event loop.
620
+
621
+ Args:
622
+ request: The request to execute.
623
+
624
+ Returns:
625
+ A CoroutineTask handle to operate the background task.
626
+ """
627
+ task = asyncio.create_task(_execute_request_coroutine(request))
628
+ return CoroutineTask(task)
629
+
630
+
631
+ def _execute_with_config_override(func: Callable,
632
+ request_body: payloads.RequestBody,
633
+ request_id: str, request_name: str,
634
+ **kwargs) -> Any:
635
+ """Execute a function with env and config override inside a thread."""
636
+ # Override the environment and config within this thread's context,
637
+ # which gets copied when we call to_thread.
638
+ with override_request_env_and_config(request_body, request_id,
639
+ request_name):
640
+ return func(**kwargs)
641
+
642
+
643
+ async def _execute_request_coroutine(request: api_requests.Request):
440
644
  """Execute a request in current event loop.
441
645
 
442
646
  Similar to _request_execution_wrapper, but executed as coroutine in current
@@ -449,39 +653,43 @@ async def execute_request_coroutine(request: api_requests.Request):
449
653
  logger.info(f'Executing request {request.request_id} in coroutine')
450
654
  func = request.entrypoint
451
655
  request_body = request.request_body
452
- with api_requests.update_request(request.request_id) as request_task:
453
- request_task.status = api_requests.RequestStatus.RUNNING
656
+ await api_requests.update_status_async(request.request_id,
657
+ api_requests.RequestStatus.RUNNING)
454
658
  # Redirect stdout and stderr to the request log path.
455
659
  original_output = ctx.redirect_log(request.log_path)
456
- # Override environment variables that backs env_options.Options
457
- # TODO(aylei): compared to process executor, running task in coroutine has
458
- # two issues to fix:
459
- # 1. skypilot config is not contextual
460
- # 2. envs that read directly from os.environ are not contextual
461
- ctx.override_envs(request_body.env_vars)
462
- fut: asyncio.Future = context_utils.to_thread(func,
463
- **request_body.to_kwargs())
660
+ try:
661
+ fut: asyncio.Future = context_utils.to_thread_with_executor(
662
+ get_request_thread_executor(), _execute_with_config_override, func,
663
+ request_body, request.request_id, request.name,
664
+ **request_body.to_kwargs())
665
+ except Exception as e: # pylint: disable=broad-except
666
+ ctx.redirect_log(original_output)
667
+ await api_requests.set_request_failed_async(request.request_id, e)
668
+ logger.error(f'Failed to run request {request.request_id} due to '
669
+ f'{common_utils.format_exception(e)}')
670
+ return
464
671
 
465
672
  async def poll_task(request_id: str) -> bool:
466
- request = await api_requests.get_request_async(request_id)
467
- if request is None:
673
+ req_status = await api_requests.get_request_status_async(request_id)
674
+ if req_status is None:
468
675
  raise RuntimeError('Request not found')
469
676
 
470
- if request.status == api_requests.RequestStatus.CANCELLED:
677
+ if req_status.status == api_requests.RequestStatus.CANCELLED:
471
678
  ctx.cancel()
472
679
  return True
473
680
 
474
681
  if fut.done():
475
682
  try:
476
683
  result = await fut
477
- api_requests.set_request_succeeded(request_id, result)
684
+ await api_requests.set_request_succeeded_async(
685
+ request_id, result)
478
686
  except asyncio.CancelledError:
479
687
  # The task is cancelled by ctx.cancel(), where the status
480
688
  # should already be set to CANCELLED.
481
689
  pass
482
690
  except Exception as e: # pylint: disable=broad-except
483
691
  ctx.redirect_log(original_output)
484
- api_requests.set_request_failed(request_id, e)
692
+ await api_requests.set_request_failed_async(request_id, e)
485
693
  logger.error(f'Request {request_id} failed due to '
486
694
  f'{common_utils.format_exception(e)}')
487
695
  return True
@@ -496,22 +704,25 @@ async def execute_request_coroutine(request: api_requests.Request):
496
704
  except asyncio.CancelledError:
497
705
  # Current coroutine is cancelled due to client disconnect, set the
498
706
  # request status for consistency.
499
- api_requests.set_request_cancelled(request.request_id)
707
+ await api_requests.set_request_cancelled_async(request.request_id)
500
708
  pass
501
709
  # pylint: disable=broad-except
502
710
  except (Exception, KeyboardInterrupt, SystemExit) as e:
503
711
  # Handle any other error
504
712
  ctx.redirect_log(original_output)
505
- ctx.cancel()
506
- api_requests.set_request_failed(request.request_id, e)
713
+ await api_requests.set_request_failed_async(request.request_id, e)
507
714
  logger.error(f'Request {request.request_id} interrupted due to '
508
715
  f'unhandled exception: {common_utils.format_exception(e)}')
509
716
  raise
717
+ finally:
718
+ # Always cancel the context to kill potentially running background
719
+ # routine.
720
+ ctx.cancel()
510
721
 
511
722
 
512
- def prepare_request(
723
+ async def prepare_request_async(
513
724
  request_id: str,
514
- request_name: str,
725
+ request_name: request_names.RequestName,
515
726
  request_body: payloads.RequestBody,
516
727
  func: Callable[P, Any],
517
728
  request_cluster_name: Optional[str] = None,
@@ -535,7 +746,7 @@ def prepare_request(
535
746
  user_id=user_id,
536
747
  cluster_name=request_cluster_name)
537
748
 
538
- if not api_requests.create_if_not_exists(request):
749
+ if not await api_requests.create_if_not_exists_async(request):
539
750
  raise exceptions.RequestAlreadyExistsError(
540
751
  f'Request {request_id} already exists.')
541
752
 
@@ -543,17 +754,18 @@ def prepare_request(
543
754
  return request
544
755
 
545
756
 
546
- def schedule_request(request_id: str,
547
- request_name: str,
548
- request_body: payloads.RequestBody,
549
- func: Callable[P, Any],
550
- request_cluster_name: Optional[str] = None,
551
- ignore_return_value: bool = False,
552
- schedule_type: api_requests.ScheduleType = (
553
- api_requests.ScheduleType.LONG),
554
- is_skypilot_system: bool = False,
555
- precondition: Optional[preconditions.Precondition] = None,
556
- retryable: bool = False) -> None:
757
+ async def schedule_request_async(request_id: str,
758
+ request_name: request_names.RequestName,
759
+ request_body: payloads.RequestBody,
760
+ func: Callable[P, Any],
761
+ request_cluster_name: Optional[str] = None,
762
+ ignore_return_value: bool = False,
763
+ schedule_type: api_requests.ScheduleType = (
764
+ api_requests.ScheduleType.LONG),
765
+ is_skypilot_system: bool = False,
766
+ precondition: Optional[
767
+ preconditions.Precondition] = None,
768
+ retryable: bool = False) -> None:
557
769
  """Enqueue a request to the request queue.
558
770
 
559
771
  Args:
@@ -574,13 +786,37 @@ def schedule_request(request_id: str,
574
786
  The precondition is waited asynchronously and does not block the
575
787
  caller.
576
788
  """
577
- prepare_request(request_id, request_name, request_body, func,
578
- request_cluster_name, schedule_type, is_skypilot_system)
789
+ request_task = await prepare_request_async(request_id, request_name,
790
+ request_body, func,
791
+ request_cluster_name,
792
+ schedule_type,
793
+ is_skypilot_system)
794
+ schedule_prepared_request(request_task, ignore_return_value, precondition,
795
+ retryable)
796
+
797
+
798
+ def schedule_prepared_request(request_task: api_requests.Request,
799
+ ignore_return_value: bool = False,
800
+ precondition: Optional[
801
+ preconditions.Precondition] = None,
802
+ retryable: bool = False) -> None:
803
+ """Enqueue a request to the request queue
804
+
805
+ Args:
806
+ request_task: The prepared request task to schedule.
807
+ ignore_return_value: If True, the return value of the function will be
808
+ ignored.
809
+ precondition: If a precondition is provided, the request will only be
810
+ scheduled for execution when the precondition is met (returns True).
811
+ The precondition is waited asynchronously and does not block the
812
+ caller.
813
+ retryable: Whether the request should be retried if it fails.
814
+ """
579
815
 
580
816
  def enqueue():
581
- input_tuple = (request_id, ignore_return_value, retryable)
582
- logger.info(f'Queuing request: {request_id}')
583
- _get_queue(schedule_type).put(input_tuple)
817
+ input_tuple = (request_task.request_id, ignore_return_value, retryable)
818
+ logger.info(f'Queuing request: {request_task.request_id}')
819
+ _get_queue(request_task.schedule_type).put(input_tuple)
584
820
 
585
821
  if precondition is not None:
586
822
  # Wait async to avoid blocking caller.