skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/common.py CHANGED
@@ -17,7 +17,6 @@ import time
 import typing
 from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
                     Tuple, TypeVar, Union)
-from urllib import parse
 import uuid
 
 import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
 @annotations.lru_cache(scope='global')
 def get_dashboard_url(server_url: str,
                       starting_page: Optional[str] = None) -> str:
-    # The server_url may include username or password with the
-    # format of https://username:password@example.com:8080/path
-    # We need to remove the username and password and only
-    # return `https://example.com:8080/path`
-    parsed = parse.urlparse(server_url)
-    # Reconstruct the URL without credentials but keeping the scheme
-    dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
-    if parsed.port:
-        dashboard_url = f'{dashboard_url}:{parsed.port}'
-    if parsed.path:
-        dashboard_url = f'{dashboard_url}{parsed.path}'
-    dashboard_url = dashboard_url.rstrip('/')
+    dashboard_url = server_url.rstrip('/')
     dashboard_url = f'{dashboard_url}/dashboard'
     if starting_page:
         dashboard_url = f'{dashboard_url}/{starting_page}'
@@ -490,6 +478,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
 def handle_request_error(response: 'requests.Response') -> None:
     # Keep the original HTTPError if the response code >= 400
     response.raise_for_status()
+
     # Other status codes are not expected neither, e.g. we do not expect to
     # handle redirection here.
     if response.status_code != 200:
@@ -515,6 +504,19 @@ def get_request_id(response: 'requests.Response') -> RequestId[T]:
     return RequestId[T](request_id)
 
 
+def get_stream_request_id(
+        response: 'requests.Response') -> Optional[RequestId[T]]:
+    """This is same as the above function, but just for `sdk.stream_and_get.
+    We do this because `/api/stream` may choose the latest request id, and
+    we need to keep track of that information. Request id in this case can
+    be None."""
+    handle_request_error(response)
+    request_id = response.headers.get(server_constants.STREAM_REQUEST_HEADER)
+    if request_id is not None:
+        return RequestId[T](request_id)
+    return None
+
+
 def _start_api_server(deploy: bool = False,
                       host: str = '127.0.0.1',
                       foreground: bool = False,
@@ -537,14 +539,27 @@ def _start_api_server(deploy: bool = False,
                        'is not a local URL')
 
     # Check available memory before starting the server.
-    avail_mem_size_gb: float = common_utils.get_mem_size_gb()
-    if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
-        logger.warning(
-            f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
-            f'has {avail_mem_size_gb:.1f}GB memory available. '
-            f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is '
-            'recommended to support higher load with better performance.'
-            f'{colorama.Style.RESET_ALL}')
+    # Skip this warning if postgres is used, as:
+    # 1) that's almost certainly a remote API server;
+    # 2) the actual consolidation mode config is stashed in the database,
+    #    and the value of `job_utils.is_consolidation_mode` will not be
+    #    the actual value in the db, but only None as in this case, the
+    #    whole YAML config is really just `db: <URI>`.
+    if skypilot_config.get_nested(('db',), None) is None:
+        avail_mem_size_gb: float = common_utils.get_mem_size_gb()
+        # pylint: disable=import-outside-toplevel
+        import sky.jobs.utils as job_utils
+        max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                      if job_utils.is_consolidation_mode(
+                          on_api_restart=True) else
+                      server_constants.MIN_AVAIL_MEM_GB)
+        if avail_mem_size_gb <= max_memory:
+            logger.warning(
+                f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
+                f'only has {avail_mem_size_gb:.1f}GB memory available. '
+                f'At least {max_memory}GB is recommended to support higher '
+                'load with better performance.'
+                f'{colorama.Style.RESET_ALL}')
 
     args = [sys.executable, *API_SERVER_CMD.split()]
     if deploy:
@@ -762,6 +777,7 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
             os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
         # Check again if server is already running. Other processes may
         # have started the server while we were waiting for the lock.
+        get_api_server_status.cache_clear()  # type: ignore[attr-defined]
         api_server_info = get_api_server_status(endpoint)
         if api_server_info.status == ApiServerStatus.UNHEALTHY:
             _start_api_server(deploy, host, foreground, metrics,
@@ -823,7 +839,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
     for task_config in task_configs:
         if task_config is None:
             continue
-        file_mounts_mapping = task_config.get('file_mounts_mapping', {})
+        file_mounts_mapping = task_config.pop('file_mounts_mapping', {})
        if not file_mounts_mapping:
            # We did not mount any files to new paths on the remote server
            # so no need to resolve filepaths.
@@ -895,12 +911,18 @@ def reload_for_new_request(client_entrypoint: Optional[str],
                            client_command: Optional[str],
                            using_remote_api_server: bool, user: 'models.User',
                            request_id: str) -> None:
-    """Reload modules, global variables, and usage message for a new request."""
+    """Reload modules, global variables, and usage message for a new request.
+
+    Must be called within the request's context.
+    """
     # This should be called first to make sure the logger is up-to-date.
     sky_logging.reload_logger()
 
     # Reload the skypilot config to make sure the latest config is used.
-    skypilot_config.safe_reload_config()
+    # We don't need to grab the lock here because this function is only
+    # run once we are inside the request's context, so there shouldn't
+    # be any race conditions when reloading the config.
+    skypilot_config.reload_config()
 
     # Reset the client entrypoint and command for the usage message.
     common_utils.set_request_context(
@@ -931,6 +953,7 @@ def clear_local_api_server_database() -> None:
     db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
     for extension in ['', '-shm', '-wal']:
         try:
+            logger.debug(f'Removing database file {db_path}{extension}')
             os.remove(f'{db_path}{extension}')
         except FileNotFoundError:
             logger.debug(f'Database file {db_path}{extension} not found.')
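Note on the new get_stream_request_id helper above: the server now reports which request is actually being streamed via a response header (X-SkyPilot-Stream-Request-ID, added to sky/server/constants.py below). The following is a minimal sketch of how a caller could consume that header; the endpoint path, port, and the use of the requests library are illustrative assumptions, not SkyPilot's actual client code.

import requests  # assumed dependency for this sketch

STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'

# Hypothetical call: stream output without specifying a request id, letting
# /api/stream pick the latest request on the server side.
response = requests.get('http://127.0.0.1:46580/api/stream', stream=True)
response.raise_for_status()
# The server echoes back which request it chose; the header may be absent.
request_id = response.headers.get(STREAM_REQUEST_HEADER)
if request_id is not None:
    print(f'Streaming output of request {request_id}')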
sky/server/config.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
 
 from sky import sky_logging
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.utils import common_utils
 
 # Constants based on profiling the peak memory usage while serving various
@@ -19,8 +20,9 @@ from sky.utils import common_utils
 # TODO(aylei): maintaining these constants is error-prone, we may need to
 # automatically tune parallelism at runtime according to system usage stats
 # in the future.
-_LONG_WORKER_MEM_GB = 0.4
-_SHORT_WORKER_MEM_GB = 0.25
+# TODO(luca): The future is now! ^^^
+LONG_WORKER_MEM_GB = 0.4
+SHORT_WORKER_MEM_GB = 0.3
 # To control the number of long workers.
 _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
 # Limit the number of long workers of local API server, since local server is
@@ -35,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
 _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
 # Minimal number of long workers to ensure responsiveness.
 _MIN_LONG_WORKERS = 1
-# Minimal number of short workers, there is a daemon task running on short
-# workers so at least 2 workers are needed to ensure responsiveness.
-_MIN_SHORT_WORKERS = 2
+# Minimal number of idle short workers to ensure responsiveness.
+_MIN_IDLE_SHORT_WORKERS = 1
 
 # Default number of burstable workers for local API server. A heuristic number
 # that is large enough for most local cases.
@@ -74,9 +75,11 @@ class ServerConfig:
     queue_backend: QueueBackend
 
 
-def compute_server_config(deploy: bool,
-                          max_db_connections: Optional[int] = None
-                         ) -> ServerConfig:
+def compute_server_config(
+        deploy: bool,
+        max_db_connections: Optional[int] = None,
+        quiet: bool = False,
+        reserved_memory_mb: Optional[float] = None) -> ServerConfig:
     """Compute the server config based on environment.
 
     We have different assumptions for the resources in different deployment
@@ -110,7 +113,11 @@
     process after API server was introduced.
     """
     cpu_count = common_utils.get_cpu_count()
+    logger.debug(f'CPU count: {cpu_count}')
     mem_size_gb = common_utils.get_mem_size_gb()
+    if reserved_memory_mb is not None:
+        mem_size_gb -= (reserved_memory_mb / 1024)
+    logger.debug(f'Memory size: {mem_size_gb}GB')
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
                                                        mem_size_gb,
                                                        local=not deploy)
@@ -140,7 +147,12 @@
         burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
     # Runs in low resource mode if the available memory is less than
     # server_constants.MIN_AVAIL_MEM_GB.
-    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    if not deploy and mem_size_gb < max_memory:
         # Permanent worker process may have significant memory consumption
         # (~350MB per worker) after running commands like `sky check`, so we
         # don't start any permanent workers in low resource local mode. This
@@ -151,25 +163,29 @@
         # permanently because it never exits.
         max_parallel_for_long = 0
         max_parallel_for_short = 0
-        logger.warning(
-            'SkyPilot API server will run in low resource mode because '
-            'the available memory is less than '
-            f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+        if not quiet:
+            logger.warning(
+                'SkyPilot API server will run in low resource mode because '
+                'the available memory is less than '
+                f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
     elif max_db_connections is not None:
         if max_parallel_all_workers > max_db_connections:
-            logger.warning(
-                f'Max parallel all workers ({max_parallel_all_workers}) '
-                f'is greater than max db connections ({max_db_connections}). '
-                'Increase the number of max db connections to '
-                f'at least {max_parallel_all_workers} for optimal performance.')
+            if not quiet:
+                logger.warning(
+                    f'Max parallel all workers ({max_parallel_all_workers}) '
+                    'is greater than max db connections '
+                    f'({max_db_connections}). Increase the number of max db '
+                    f'connections to at least {max_parallel_all_workers} for '
+                    'optimal performance.')
         else:
             num_db_connections_per_worker = 1
 
-    logger.info(
-        f'SkyPilot API server will start {num_server_workers} server processes '
-        f'with {max_parallel_for_long} background workers for long requests '
-        f'and will allow at max {max_parallel_for_short} short requests in '
-        f'parallel.')
+    if not quiet:
+        logger.info(
+            f'SkyPilot API server will start {num_server_workers} server '
+            f'processes with {max_parallel_for_long} background workers for '
+            f'long requests and will allow at max {max_parallel_for_short} '
+            'short requests in parallel.')
     return ServerConfig(
         num_server_workers=num_server_workers,
         queue_backend=queue_backend,
@@ -190,10 +206,15 @@ def _max_long_worker_parallism(cpu_count: int,
                                local=False) -> int:
     """Max parallelism for long workers."""
     # Reserve min available memory to avoid OOM.
-    available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    available_mem = max(0, mem_size_gb - max_memory)
     cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
     mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
-                                 _LONG_WORKER_MEM_GB)
+                                 LONG_WORKER_MEM_GB)
     n = max(_MIN_LONG_WORKERS,
             min(cpu_based_max_parallel, mem_based_max_parallel))
     if local:
@@ -201,12 +222,25 @@
     return n
 
 
+def _get_min_short_workers() -> int:
+    """Min number of short workers."""
+    daemon_count = 0
+    for daemon in daemons.INTERNAL_REQUEST_DAEMONS:
+        if not daemon.should_skip():
+            daemon_count += 1
+    return _MIN_IDLE_SHORT_WORKERS + daemon_count
+
+
 def _max_short_worker_parallism(mem_size_gb: float,
                                 long_worker_parallism: int) -> int:
     """Max parallelism for short workers."""
     # Reserve memory for long workers and min available memory.
-    reserved_mem = server_constants.MIN_AVAIL_MEM_GB + (long_worker_parallism *
-                                                        _LONG_WORKER_MEM_GB)
+    # pylint: disable=import-outside-toplevel
+    import sky.jobs.utils as job_utils
+    max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
+                  if job_utils.is_consolidation_mode() else
+                  server_constants.MIN_AVAIL_MEM_GB)
+    reserved_mem = max_memory + (long_worker_parallism * LONG_WORKER_MEM_GB)
     available_mem = max(0, mem_size_gb - reserved_mem)
-    n = max(_MIN_SHORT_WORKERS, int(available_mem / _SHORT_WORKER_MEM_GB))
+    n = max(_get_min_short_workers(), int(available_mem / SHORT_WORKER_MEM_GB))
     return n
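The worker-count heuristics above reduce to a small formula: long-worker parallelism is bounded by both the CPU count and the memory left after the reserved headroom, and short-worker parallelism gets whatever memory remains after the long workers. Below is a simplified, standalone sketch of that calculation; the constants are taken from the diff, but the function and argument names are illustrative, not the SkyPilot internals.

LONG_WORKER_MEM_GB = 0.4
SHORT_WORKER_MEM_GB = 0.3
_CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
_MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
_MIN_LONG_WORKERS = 1


def max_long_workers(cpu_count: int, mem_size_gb: float,
                     reserved_mem_gb: float) -> int:
    # Memory left after reserving OOM headroom (MIN_AVAIL_MEM_GB, or the
    # larger consolidation-mode value).
    available_mem = max(0, mem_size_gb - reserved_mem_gb)
    cpu_based = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
    mem_based = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
                    LONG_WORKER_MEM_GB)
    return max(_MIN_LONG_WORKERS, min(cpu_based, mem_based))


def max_short_workers(mem_size_gb: float, reserved_mem_gb: float,
                      long_workers: int, min_short_workers: int) -> int:
    # Memory left after the reserved headroom and the long workers.
    reserved = reserved_mem_gb + long_workers * LONG_WORKER_MEM_GB
    available_mem = max(0, mem_size_gb - reserved)
    return max(min_short_workers, int(available_mem / SHORT_WORKER_MEM_GB))


# Example: an 8-CPU, 16 GB machine with the default 2 GB reservation allows
# min(16, 21) = 16 long workers and int((16 - 2 - 16 * 0.4) / 0.3) = 25 short
# workers, before the deploy/local caps applied in the real code.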
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 17
+API_VERSION = 25
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
@@ -34,6 +34,7 @@ VERSION_HEADER = 'X-SkyPilot-Version'
 REQUEST_NAME_PREFIX = 'sky.'
 # The memory (GB) that SkyPilot tries to not use to prevent OOM.
 MIN_AVAIL_MEM_GB = 2
+MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
 # Default encoder/decoder handler name.
 DEFAULT_HANDLER_NAME = 'default'
 # The path to the API request database.
@@ -60,3 +61,10 @@ DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
 
 # The interval (seconds) for the event to be restarted in the background.
 DAEMON_RESTART_INTERVAL_SECONDS = 20
+
+# Cookie header for stream request id.
+STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
+
+# Valid empty values for pickled fields (base64-encoded pickled None)
+# base64.b64encode(pickle.dumps(None)).decode('utf-8')
+EMPTY_PICKLED_VALUE = 'gAROLg=='
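For reference, EMPTY_PICKLED_VALUE is exactly what the comment in the diff describes: the base64 encoding of a pickled None. A quick check is below; the explicit protocol=4 is an assumption made to keep the bytes deterministic ('gAROLg==' decodes to b'\x80\x04N.', a protocol-4 pickle of None), and the helper function is hypothetical.

import base64
import pickle

# 'gAROLg==' is b'\x80\x04N.' base64-encoded, i.e. None pickled with protocol 4.
assert base64.b64encode(pickle.dumps(None, protocol=4)).decode('utf-8') == 'gAROLg=='


def is_empty_pickled(value: str) -> bool:
    """Hypothetical helper: True if `value` decodes to a pickled None."""
    return pickle.loads(base64.b64decode(value)) is None


assert is_empty_pickled('gAROLg==')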
sky/server/daemons.py CHANGED
@@ -1,18 +1,30 @@
 """Internal server daemons that run in the background."""
+import atexit
 import dataclasses
 import os
 import time
+import typing
 from typing import Callable
 
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import common as adaptors_common
 from sky.server import constants as server_constants
+from sky.server.requests import request_names
+from sky.skylet import constants
 from sky.utils import annotations
-from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import env_options
+from sky.utils import locks
+from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 
+if typing.TYPE_CHECKING:
+    import pathlib
+else:
+    pathlib = adaptors_common.LazyImport('pathlib')
+
 logger = sky_logging.init_logger(__name__)
 
 
@@ -25,7 +37,7 @@ class InternalRequestDaemon:
     """Internal daemon that runs an event in the background."""
 
     id: str
-    name: str
+    name: request_names.RequestName
     event_fn: Callable[[], None]
     default_log_level: str = 'INFO'
     should_skip: Callable[[], bool] = _default_should_skip
@@ -37,9 +49,11 @@
         try:
             # Refresh config within the while loop.
             # Since this is a long running daemon,
-            # reload_config_for_new_request()
+            # reload_for_new_request()
             # is not called in between the event runs.
-            skypilot_config.safe_reload_config()
+            # We don't need to grab the lock here because each of the daemons
+            # run in their own process and thus have their own request context.
+            skypilot_config.reload_config()
             # Get the configured log level for the daemon inside the event loop
             # in case the log level changes after the API server is started.
             level_str = skypilot_config.get_nested(
@@ -69,10 +83,6 @@
                 sky_logging.reload_logger()
                 level = self.refresh_log_level()
                 self.event_fn()
-                # Clear request level cache after each run to avoid
-                # using too much memory.
-                annotations.clear_request_level_cache()
-                timeline.save_timeline()
             except Exception:  # pylint: disable=broad-except
                 # It is OK to fail to run the event, as the event is not
                 # critical, but we should log the error.
@@ -82,18 +92,28 @@
                     f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                     'seconds...')
                 time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
+            finally:
+                # Clear request level cache after each run to avoid
+                # using too much memory.
+                annotations.clear_request_level_cache()
+                timeline.save_timeline()
+                # Kill all children processes related to this request.
+                # Each executor handles a single request, so we can safely
+                # kill all children processes related to this request.
+                subprocess_utils.kill_children_processes()
+                common_utils.release_memory()
 
 
 def refresh_cluster_status_event():
     """Periodically refresh the cluster status."""
     # pylint: disable=import-outside-toplevel
-    from sky import core
+    from sky.backends import backend_utils
 
     logger.info('=== Refreshing cluster status ===')
     # This periodically refresh will hold the lock for the cluster being
     # refreshed, but it is OK because other operations will just wait for
     # the lock and get the just refreshed status without refreshing again.
-    core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
+    backend_utils.refresh_cluster_records()
     logger.info('Status refreshed. Sleeping '
                 f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
                 ' seconds for the next refresh...\n')
@@ -117,25 +137,75 @@ def refresh_volume_status_event():
         time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
 
 
+_managed_job_consolidation_mode_lock = None
+
+
+# Attempt to gracefully release the lock when the process exits.
+# If this fails, it's okay, the lock will be released when the process dies.
+def _release_managed_job_consolidation_mode_lock() -> None:
+    global _managed_job_consolidation_mode_lock
+    if _managed_job_consolidation_mode_lock is not None:
+        _managed_job_consolidation_mode_lock.release()
+        _managed_job_consolidation_mode_lock = None
+
+
+atexit.register(_release_managed_job_consolidation_mode_lock)
+
+
 def managed_job_status_refresh_event():
     """Refresh the managed job status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
+    from sky.jobs import constants as managed_job_constants
     from sky.jobs import utils as managed_job_utils
-    from sky.utils import controller_utils
 
-    # We run the recovery logic before starting the event loop as those two are
-    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    if controller_utils.high_availability_specified(
-            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
+    global _managed_job_consolidation_mode_lock
+    if _managed_job_consolidation_mode_lock is None:
+        _managed_job_consolidation_mode_lock = locks.get_lock(
+            managed_job_constants.CONSOLIDATION_MODE_LOCK_ID)
+
+    # Touch the signal file here to avoid conflict with
+    # update_managed_jobs_statuses. Although we run
+    # ha_recovery_for_consolidation_mode before checking the job statuses
+    # (events.ManagedJobEvent), update_managed_jobs_statuses is also called in
+    # cancel_jobs_by_id.
+    # We also need to make sure that new controllers are not started until we
+    # acquire the consolidation mode lock, since if we have controllers on both
+    # the new and old API server during a rolling update, calling
+    # update_managed_jobs_statuses on the old API server could lead to
+    # FAILED_CONTROLLER.
+    signal_file = pathlib.Path(
+        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
+    try:
+        signal_file.touch()
+
+        # Make sure the lock is acquired for this process before proceeding to
+        # do recovery. This will block if another API server is still running,
+        # but should proceed once it is terminated and releases the lock.
+        if not _managed_job_consolidation_mode_lock.is_locked():
+            logger.info('Acquiring the consolidation mode lock: '
+                        f'{_managed_job_consolidation_mode_lock}')
+            _managed_job_consolidation_mode_lock.acquire()
+            logger.info('Lock acquired!')
+        # We don't explicitly release the lock until the process exits.
+        # Even if _release_managed_job_consolidation_mode_lock is not called,
+        # the lock should be released when the process dies (either due to the
+        # advisory file lock being released or the postgres session dying).
+
+        # We run the recovery logic before checking the job statuses as those
+        # two are conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for
+        # details.
         managed_job_utils.ha_recovery_for_consolidation_mode()
+    finally:
+        # Now, we should be sure that this is the only API server, we have
+        # started the new controllers and unclaimed all the jobs, and we are
+        # ready to update the job statuses.
+        signal_file.unlink()
 
     # After recovery, we start the event loop.
     from sky.skylet import events
     refresh_event = events.ManagedJobEvent()
-    scheduling_event = events.ManagedJobSchedulingEvent()
     logger.info('=== Running managed job event ===')
     refresh_event.run()
-    scheduling_event.run()
     time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
 
 
@@ -150,14 +220,10 @@ def _serve_status_refresh_event(pool: bool):
     """Refresh the sky serve status for controller consolidation mode."""
     # pylint: disable=import-outside-toplevel
     from sky.serve import serve_utils
-    from sky.utils import controller_utils
 
     # We run the recovery logic before starting the event loop as those two are
    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    controller = controller_utils.get_controller_for_pool(pool)
-    if controller_utils.high_availability_specified(
-            controller.value.cluster_name):
-        serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
+    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)
 
     # After recovery, we start the event loop.
     from sky.skylet import events
@@ -196,26 +262,31 @@ INTERNAL_REQUEST_DAEMONS = [
     # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
     # set to updated status automatically, without showing users the hint of
     # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status-refresh',
-                          event_fn=refresh_cluster_status_event,
-                          default_log_level='DEBUG'),
+    InternalRequestDaemon(
+        id='skypilot-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
+        event_fn=refresh_cluster_status_event,
+        default_log_level='DEBUG'),
     # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume-refresh',
-                          event_fn=refresh_volume_status_event),
+    InternalRequestDaemon(
+        id='skypilot-volume-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
+        event_fn=refresh_volume_status_event),
     InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status-refresh',
+                          name=request_names.RequestName.
+                          REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH,
                           event_fn=managed_job_status_refresh_event,
                           should_skip=should_skip_managed_job_status_refresh),
-    InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
-                          name='sky-serve-status-refresh',
-                          event_fn=sky_serve_status_refresh_event,
-                          should_skip=should_skip_sky_serve_status_refresh),
-    InternalRequestDaemon(id='pool-status-refresh-daemon',
-                          name='pool-status-refresh',
-                          event_fn=pool_status_refresh_event,
-                          should_skip=should_skip_pool_status_refresh),
+    InternalRequestDaemon(
+        id='sky-serve-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
+        event_fn=sky_serve_status_refresh_event,
+        should_skip=should_skip_sky_serve_status_refresh),
+    InternalRequestDaemon(
+        id='pool-status-refresh-daemon',
+        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
+        event_fn=pool_status_refresh_event,
+        should_skip=should_skip_pool_status_refresh),
 ]
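The consolidation-mode lock handling added above follows an "acquire once, hold for the process lifetime, release at exit" pattern. Below is a stripped-down sketch of that pattern using the third-party filelock package as a stand-in for sky.utils.locks; the lock path and helper names are illustrative, not SkyPilot's.

import atexit

import filelock  # assumed dependency for this sketch

_consolidation_lock = None


def _release_consolidation_lock() -> None:
    # Best-effort release at exit; if this never runs, the OS drops the
    # advisory file lock when the process dies anyway.
    global _consolidation_lock
    if _consolidation_lock is not None:
        _consolidation_lock.release()
        _consolidation_lock = None


atexit.register(_release_consolidation_lock)


def ensure_consolidation_lock(path: str = '/tmp/consolidation_mode.lock') -> None:
    """Acquire the lock once per process; later calls are no-ops."""
    global _consolidation_lock
    if _consolidation_lock is None:
        _consolidation_lock = filelock.FileLock(path)
    if not _consolidation_lock.is_locked:
        # Blocks while another process (e.g. an old API server during a
        # rolling update) still holds the lock.
        _consolidation_lock.acquire()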