skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -23,10 +23,10 @@ from sky.clouds import gcp
 from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
-from sky.jobs import state as managed_job_state
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
+from sky.server import config as server_config
 from sky.setup_files import dependencies
 from sky.skylet import constants
 from sky.skylet import log_lib
@@ -72,7 +72,8 @@ class _ControllerSpec:
     """Spec for skypilot controllers."""
     controller_type: str
     name: str
-    cluster_name: str
+    _cluster_name_func: Callable[[], str]
+    _cluster_name_from_server: Optional[str]  # For client-side only
     in_progress_hint: Callable[[bool], str]
     decline_cancel_hint: str
     _decline_down_when_failed_to_fetch_status_hint: str
@@ -93,6 +94,24 @@ class _ControllerSpec:
         return self._check_cluster_name_hint.format(
             cluster_name=self.cluster_name)
 
+    @property
+    def cluster_name(self) -> str:
+        """The cluster name of the controller.
+
+        On the server-side, the cluster name is the actual cluster name,
+        which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
+
+        On the client-side, the cluster name may not be accurate,
+        as we may not know the exact name, because we are missing
+        the server-side common.SERVER_ID. We have to wait until
+        we get the actual cluster name from the server.
+        """
+        return (self._cluster_name_from_server if self._cluster_name_from_server
+                is not None else self._cluster_name_func())
+
+    def set_cluster_name_from_server(self, cluster_name: str) -> None:
+        self._cluster_name_from_server = cluster_name
+
 
 # TODO: refactor controller class to not be an enum.
 class Controllers(enum.Enum):
@@ -102,7 +121,8 @@ class Controllers(enum.Enum):
     JOBS_CONTROLLER = _ControllerSpec(
         controller_type='jobs',
         name='managed jobs controller',
-        cluster_name=common.JOB_CONTROLLER_NAME,
+        _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
+        _cluster_name_from_server=None,
        in_progress_hint=lambda _:
        ('* {job_info}To see all managed jobs: '
         f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
@@ -133,7 +153,8 @@ class Controllers(enum.Enum):
     SKY_SERVE_CONTROLLER = _ControllerSpec(
         controller_type='serve',
         name='serve controller',
-        cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+        _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
+        _cluster_name_from_server=None,
        in_progress_hint=(
            lambda pool:
            (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
@@ -166,7 +187,9 @@ class Controllers(enum.Enum):
         default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
 
     @classmethod
-    def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
+    def from_name(cls,
+                  name: Optional[str],
+                  expect_exact_match: bool = True) -> Optional['Controllers']:
         """Check if the cluster name is a controller name.
 
         Returns:
@@ -187,7 +210,11 @@ class Controllers(enum.Enum):
         elif name.startswith(common.JOB_CONTROLLER_PREFIX):
             controller = cls.JOBS_CONTROLLER
             prefix = common.JOB_CONTROLLER_PREFIX
-        if controller is not None and name != controller.value.cluster_name:
+
+        if controller is not None and expect_exact_match:
+            assert name == controller.value.cluster_name, (
+                name, controller.value.cluster_name)
+        elif controller is not None and name != controller.value.cluster_name:
             # The client-side cluster_name is not accurate. Assume that `name`
             # is the actual cluster name, so need to set the controller's
             # cluster name to the input name.
@@ -201,7 +228,7 @@ class Controllers(enum.Enum):
                 prefix)
 
             # Update the cluster name.
-            controller.value.cluster_name = name
+            controller.value.set_cluster_name_from_server(name)
         return controller
 
     @classmethod
@@ -228,10 +255,21 @@ def get_controller_for_pool(pool: bool) -> Controllers:
 def high_availability_specified(cluster_name: Optional[str]) -> bool:
     """Check if the controller high availability is specified in user config.
     """
-    controller = Controllers.from_name(cluster_name)
+    controller = Controllers.from_name(cluster_name, expect_exact_match=False)
     if controller is None:
         return False
 
+    if controller.value.controller_type == 'jobs':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.jobs import utils as managed_job_utils
+        if managed_job_utils.is_consolidation_mode():
+            return True
+    elif controller.value.controller_type == 'serve':
+        # pylint: disable-next=import-outside-toplevel
+        from sky.serve import serve_utils
+        if serve_utils.is_consolidation_mode():
+            return True
+
     if skypilot_config.loaded():
         return skypilot_config.get_nested((controller.value.controller_type,
                                            'controller', 'high_availability'),
@@ -400,7 +438,7 @@ def check_cluster_name_not_controller(
     Returns:
       None, if the cluster name is not a controller name.
     """
-    controller = Controllers.from_name(cluster_name)
+    controller = Controllers.from_name(cluster_name, expect_exact_match=False)
     if controller is not None:
         msg = controller.value.check_cluster_name_hint
         if operation_str is not None:
@@ -495,6 +533,9 @@ def shared_controller_vars_to_fill(
         # before popping allowed_contexts. If it is not on Kubernetes,
         # we may be able to use allowed_contexts.
         local_user_config.pop('allowed_contexts', None)
+        # Remove api_server config so that the controller does not try to use
+        # a remote API server.
+        local_user_config.pop('api_server', None)
         with tempfile.NamedTemporaryFile(
                 delete=False,
                 suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
@@ -528,7 +569,15 @@ def shared_controller_vars_to_fill(
         # with a remote API server.
         constants.USING_REMOTE_API_SERVER_ENV_VAR: str(
             common_utils.get_using_remote_api_server()),
+        constants.IS_SKYPILOT_SERVE_CONTROLLER:
+            ('true'
+             if controller == Controllers.SKY_SERVE_CONTROLLER else 'false'),
     })
+    override_concurrent_launches = os.environ.get(
+        constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES, None)
+    if override_concurrent_launches is not None:
+        env_vars[constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES] = str(
+            int(override_concurrent_launches))
     if skypilot_config.loaded():
         # Only set the SKYPILOT_CONFIG env var if the user has a config file.
         env_vars[
@@ -609,15 +658,16 @@ def get_controller_resources(
         controller_resources_to_use: resources.Resources = list(
             controller_resources)[0]
 
-    controller_record = global_user_state.get_cluster_from_name(
+    controller_handle = global_user_state.get_handle_from_cluster_name(
         controller.value.cluster_name)
-    if controller_record is not None:
-        handle = controller_record.get('handle', None)
-        if handle is not None:
+    if controller_handle is not None:
+        if controller_handle is not None:
             # Use the existing resources, but override the autostop config with
             # the one currently specified in the config.
-            controller_resources_to_use = handle.launched_resources.copy(
-                autostop=controller_resources_config_copied.get('autostop'))
+            controller_resources_to_use = (
+                controller_handle.launched_resources.copy(
+                    autostop=controller_resources_config_copied.get('autostop'))
+            )
 
     # If the controller and replicas are from the same cloud (and region/zone),
     # it should provide better connectivity. We will let the controller choose
@@ -714,6 +764,17 @@ def get_controller_resources(
     return result
 
 
+def get_controller_mem_size_gb() -> float:
+    try:
+        with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
+                  'r',
+                  encoding='utf-8') as f:
+            return float(f.read())
+    except FileNotFoundError:
+        pass
+    return common_utils.get_mem_size_gb()
+
+
 def _setup_proxy_command_on_controller(
         controller_launched_cloud: 'clouds.Cloud',
         user_config: Dict[str, Any]) -> config_utils.Config:
@@ -1174,77 +1235,175 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
1174
1235
 
1175
1236
  # ======================= Resources Management Functions =======================
1176
1237
 
1177
- # Based on testing, assume a running job process uses 350MB memory. We use the
1178
- # same estimation for service controller process.
1179
- JOB_MEMORY_MB = 350
1180
- # Monitoring process for service is 1GB. This is based on an old estimation but
1181
- # we keep it here for now.
1238
+ # Monitoring process for service is 512MB. This is based on an old
1239
+ # estimation but we keep it here for now.
1182
1240
  # TODO(tian): Remeasure this.
1183
- SERVE_MONITORING_MEMORY_MB = 1024
1184
- # The ratio of service controller process to job process. We will treat each
1185
- # service as SERVE_PROC_RATIO job processes.
1186
- SERVE_PROC_RATIO = SERVE_MONITORING_MEMORY_MB / JOB_MEMORY_MB
1187
- # Past 2000 simultaneous jobs, we become unstable.
1188
- # See https://github.com/skypilot-org/skypilot/issues/4649.
1189
- MAX_JOB_LIMIT = 2000
1190
- # Number of ongoing launches launches allowed per CPU, for managed jobs.
1191
- JOB_LAUNCHES_PER_CPU = 4
1192
- # Number of ongoing launches launches allowed per CPU, for services. This is
1193
- # also based on an old estimation, but SKyServe indeed spawn a new process
1194
- # for each launch operation, so it should be slightly more resources demanding
1195
- # than managed jobs.
1196
- SERVE_LAUNCHES_PER_CPU = 2
1197
- # The ratio of service launch to job launch. This is inverted as the parallelism
1198
- # is determined by 1 / LAUNCHES_PER_CPU.
1199
- SERVE_LAUNCH_RATIO = JOB_LAUNCHES_PER_CPU / SERVE_LAUNCHES_PER_CPU
1241
+ SERVE_MONITORING_MEMORY_MB = 512
1242
+ # The resource consumption ratio of service launch to serve down.
1243
+ SERVE_LAUNCH_RATIO = 2.0
1200
1244
 
1201
1245
  # The _RESOURCES_LOCK should be held whenever we are checking the parallelism
1202
1246
  # control or updating the schedule_state of any job or service. Any code that
1203
1247
  # takes this lock must conclude by calling maybe_schedule_next_jobs.
1204
1248
  _RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
1205
1249
 
1206
-
1207
- @annotations.lru_cache(scope='global', maxsize=1)
1208
- def get_resources_lock_path() -> str:
1209
- path = os.path.expanduser(_RESOURCES_LOCK)
1210
- os.makedirs(os.path.dirname(path), exist_ok=True)
1211
- return path
1250
+ # keep 2GB reserved after the controllers
1251
+ MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
1252
+
1253
+ # NOTE: In the current implementation, we only consider the memory
1254
+ # The ratio of resources consumption for managed jobs and pool/serve.
1255
+ # This measures pool_resources / jobs_resources. If 2 GB memory is allocated to
1256
+ # jobs, then 2 * POOL_JOBS_RESOURCES_RATIO GB memory is allocated to pool/serve.
1257
+ POOL_JOBS_RESOURCES_RATIO = 1
1258
+ # Number of ongoing launches launches allowed per worker. Can probably be
1259
+ # increased a bit to around 16 but keeping it lower to just to be safe
1260
+ LAUNCHES_PER_WORKER = 8
1261
+ # Number of ongoing launches allowed per service. Can probably be increased
1262
+ # a bit as well.
1263
+ LAUNCHES_PER_SERVICE = 4
1264
+
1265
+ # Based on testing, each worker takes around 200-300MB memory. Keeping it
1266
+ # higher to be safe.
1267
+ JOB_WORKER_MEMORY_MB = 400
1268
+ # this can probably be increased to around 300-400 but keeping it lower to just
1269
+ # to be safe
1270
+ MAX_JOBS_PER_WORKER = 200
1271
+ # Maximum number of controllers that can be running. Hard to handle more than
1272
+ # 512 launches at once.
1273
+ MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
1274
+ # Limit the number of jobs that can be running at once on the entire jobs
1275
+ # controller cluster. It's hard to handle cancellation of more than 2000 jobs at
1276
+ # once.
1277
+ # TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
1278
+ # hardcoded max limit.
1279
+ MAX_TOTAL_RUNNING_JOBS = 2000
1280
+
1281
+
1282
+ def compute_memory_reserved_for_controllers(
1283
+ reserve_for_controllers: bool, reserve_extra_for_pool: bool) -> float:
1284
+ reserved_memory_mb = 0.0
1285
+ if reserve_for_controllers:
1286
+ reserved_memory_mb = float(MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB)
1287
+ if reserve_extra_for_pool:
1288
+ reserved_memory_mb *= (1. + POOL_JOBS_RESOURCES_RATIO)
1289
+ return reserved_memory_mb
1290
+
1291
+
1292
+ def _get_total_usable_memory_mb(pool: bool, consolidation_mode: bool) -> float:
1293
+ controller_reserved = compute_memory_reserved_for_controllers(
1294
+ reserve_for_controllers=True, reserve_extra_for_pool=pool)
1295
+ total_memory_mb = (common_utils.get_mem_size_gb() * 1024 -
1296
+ controller_reserved)
1297
+ if not consolidation_mode:
1298
+ return total_memory_mb
1299
+ config = server_config.compute_server_config(
1300
+ deploy=True, quiet=True, reserved_memory_mb=controller_reserved)
1301
+ used = 0.0
1302
+ used += ((config.long_worker_config.garanteed_parallelism +
1303
+ config.long_worker_config.burstable_parallelism) *
1304
+ server_config.LONG_WORKER_MEM_GB * 1024)
1305
+ used += ((config.short_worker_config.garanteed_parallelism +
1306
+ config.short_worker_config.burstable_parallelism) *
1307
+ server_config.SHORT_WORKER_MEM_GB * 1024)
1308
+ return total_memory_mb - used
1309
+
1310
+
1311
+ def _is_consolidation_mode(pool: bool) -> bool:
1312
+ return skypilot_config.get_nested(
1313
+ ('jobs' if pool else 'serve', 'controller', 'consolidation_mode'),
1314
+ default_value=False)
1212
1315
 
1213
1316
 
1214
1317
  @annotations.lru_cache(scope='request')
1215
- def _get_job_parallelism() -> int:
1216
- job_memory = JOB_MEMORY_MB * 1024 * 1024
1217
- job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
1218
- return max(job_limit, 1)
1318
+ def _get_parallelism(pool: bool, raw_resource_per_unit: float) -> int:
1319
+ """Returns the number of jobs controllers / services that should be running.
1219
1320
 
1321
+ This is the number of controllers / services that should be running
1322
+ to maximize resource utilization.
1220
1323
 
1221
- @annotations.lru_cache(scope='request')
1222
- def _get_launch_parallelism() -> int:
1223
- cpus = os.cpu_count()
1224
- return cpus * JOB_LAUNCHES_PER_CPU if cpus is not None else 1
1324
+ In consolidation mode, we use the existing API server so our resource
1325
+ requirements are just for the job controllers / services. We try taking
1326
+ up as much memory as possible left over from the API server.
1225
1327
 
1328
+ In non-consolidation mode, we have to take into account the memory of the
1329
+ API server workers. We limit to only 8 launches per worker, so our logic is
1330
+ each controller will take CONTROLLER_MEMORY_MB + 8 * WORKER_MEMORY_MB. We
1331
+ leave some leftover room for ssh codegen and ray status overhead.
1332
+ """
1333
+ consolidation_mode = _is_consolidation_mode(pool)
1226
1334
 
1227
- def can_provision() -> bool:
1228
- # We always prioritize terminating over provisioning, to save the cost on
1229
- # idle resources.
1230
- if serve_state.total_number_scheduled_to_terminate_replicas() > 0:
1231
- return False
1232
- return can_terminate()
1335
+ total_memory_mb = _get_total_usable_memory_mb(pool, consolidation_mode)
1336
+
1337
+ # In consolidation mode, we assume the API server is running in deployment
1338
+ # mode, hence resource management (i.e. how many requests are allowed) is
1339
+ # done by the API server.
1340
+ resource_per_unit_worker = 0.
1341
+ # Otherwise, it runs a local API server on the jobs/serve controller.
1342
+ # We need to do the resource management ourselves.
1343
+ if not consolidation_mode:
1344
+ launches_per_worker = (LAUNCHES_PER_WORKER
1345
+ if pool else LAUNCHES_PER_SERVICE)
1346
+ resource_per_unit_worker = (launches_per_worker *
1347
+ server_config.LONG_WORKER_MEM_GB * 1024)
1233
1348
 
1349
+ # If running pool on jobs controller, we need to account for the resources
1350
+ # consumed by the jobs.
1351
+ ratio = (1. + POOL_JOBS_RESOURCES_RATIO) if pool else 1.
1352
+ resource_per_unit = ratio * (raw_resource_per_unit +
1353
+ resource_per_unit_worker)
1234
1354
 
1235
- def can_start_new_process() -> bool:
1236
- num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
1237
- managed_job_state.get_num_alive_jobs())
1238
- return num_procs < _get_job_parallelism()
1355
+ return max(int(total_memory_mb / resource_per_unit), 1)
1239
1356
 
1240
1357
 
1241
- # We limit the number of terminating replicas to the number of CPUs. This is
1242
- # just a temporary solution to avoid overwhelming the controller. After one job
1243
- # controller PR, we should use API server to handle resources management.
1244
- def can_terminate() -> bool:
1358
+ def get_number_of_jobs_controllers() -> int:
1359
+ return min(
1360
+ MAX_CONTROLLERS,
1361
+ _get_parallelism(pool=True, raw_resource_per_unit=JOB_WORKER_MEMORY_MB))
1362
+
1363
+
1364
+ @annotations.lru_cache(scope='global', maxsize=1)
1365
+ def get_resources_lock_path() -> str:
1366
+ path = os.path.expanduser(_RESOURCES_LOCK)
1367
+ os.makedirs(os.path.dirname(path), exist_ok=True)
1368
+ return path
1369
+
1370
+
1371
+ def _get_number_of_services(pool: bool) -> int:
1372
+ return _get_parallelism(pool=pool,
1373
+ raw_resource_per_unit=SERVE_MONITORING_MEMORY_MB *
1374
+ POOL_JOBS_RESOURCES_RATIO)
1375
+
1376
+
1377
+ @annotations.lru_cache(scope='request')
1378
+ def _get_request_parallelism(pool: bool) -> int:
1379
+ # NOTE(dev): One smoke test depends on this value.
1380
+ # tests/smoke_tests/test_sky_serve.py::test_skyserve_new_autoscaler_update
1381
+ # assumes 4 concurrent launches.
1382
+ override_concurrent_launches = os.environ.get(
1383
+ constants.SERVE_OVERRIDE_CONCURRENT_LAUNCHES, None)
1384
+ if override_concurrent_launches is not None and not pool:
1385
+ return int(override_concurrent_launches)
1386
+ # Limitation per service x number of services
1387
+ launches_per_worker = (LAUNCHES_PER_WORKER
1388
+ if pool else LAUNCHES_PER_SERVICE)
1389
+ return (launches_per_worker * POOL_JOBS_RESOURCES_RATIO *
1390
+ _get_number_of_services(pool))
1391
+
1392
+
1393
+ def can_provision(pool: bool) -> bool:
1394
+ # TODO(tian): probe API server to see if there is any pending provision
1395
+ # requests.
1396
+ return can_terminate(pool)
1397
+
1398
+
1399
+ def can_start_new_process(pool: bool) -> bool:
1400
+ return serve_state.get_num_services() < _get_number_of_services(pool)
1401
+
1402
+
1403
+ def can_terminate(pool: bool) -> bool:
1404
+ # TODO(tian): probe API server to see if there is any pending terminate
1405
+ # requests.
1245
1406
  num_terminating = (
1246
- serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
1247
- # Each terminate process will take roughly the same CPUs as job launch.
1248
- serve_state.total_number_terminating_replicas() +
1249
- managed_job_state.get_num_launching_jobs())
1250
- return num_terminating < _get_launch_parallelism()
1407
+ serve_state.total_number_provisioning_replicas() +
1408
+ serve_state.total_number_terminating_replicas() / SERVE_LAUNCH_RATIO)
1409
+ return num_terminating < _get_request_parallelism(pool)
sky/utils/db/db_utils.py CHANGED
@@ -7,15 +7,17 @@ import pathlib
7
7
  import sqlite3
8
8
  import threading
9
9
  import typing
10
- from typing import Any, Callable, Dict, Iterable, Optional
10
+ from typing import Any, Callable, Dict, Iterable, Literal, Optional, Union
11
11
 
12
12
  import aiosqlite
13
13
  import aiosqlite.context
14
14
  import sqlalchemy
15
15
  from sqlalchemy import exc as sqlalchemy_exc
16
+ from sqlalchemy.ext import asyncio as sqlalchemy_async
16
17
 
17
18
  from sky import sky_logging
18
19
  from sky.skylet import constants
20
+ from sky.skylet import runtime_utils
19
21
 
20
22
  logger = sky_logging.init_logger(__name__)
21
23
  if typing.TYPE_CHECKING:
@@ -184,7 +186,7 @@ def add_column_to_table_sqlalchemy(
184
186
  pass
185
187
  else:
186
188
  raise
187
- #postgressql
189
+ #postgresql
188
190
  except sqlalchemy_exc.ProgrammingError as e:
189
191
  if 'already exists' in str(e):
190
192
  pass
@@ -200,6 +202,7 @@ def add_column_to_table_alembic(
200
202
  server_default: Optional[str] = None,
201
203
  copy_from: Optional[str] = None,
202
204
  value_to_replace_existing_entries: Optional[Any] = None,
205
+ index: Optional[bool] = None,
203
206
  ):
204
207
  """Add a column to a table using Alembic operations.
205
208
 
@@ -214,6 +217,8 @@ def add_column_to_table_alembic(
214
217
  copy_from: Column name to copy values from (for existing rows)
215
218
  value_to_replace_existing_entries: Default value for existing NULL
216
219
  entries
220
+ index: If True, create an index on this column. If None, no index
221
+ is created.
217
222
  """
218
223
  from alembic import op # pylint: disable=import-outside-toplevel
219
224
 
@@ -221,7 +226,8 @@ def add_column_to_table_alembic(
221
226
  # Create the column with server_default if provided
222
227
  column = sqlalchemy.Column(column_name,
223
228
  column_type,
224
- server_default=server_default)
229
+ server_default=server_default,
230
+ index=index)
225
231
  op.add_column(table_name, column)
226
232
 
227
233
  # Handle data migration
@@ -353,6 +359,27 @@ class SQLiteConn(threading.local):
353
359
  conn = await self._get_async_conn()
354
360
  return await conn.execute_fetchall(sql, parameters)
355
361
 
362
+ async def execute_get_returning_value_async(
363
+ self,
364
+ sql: str,
365
+ parameters: Optional[Iterable[Any]] = None
366
+ ) -> Optional[sqlite3.Row]:
367
+ conn = await self._get_async_conn()
368
+
369
+ if parameters is None:
370
+ parameters = []
371
+
372
+ def exec_and_get_returning_value(sql: str,
373
+ parameters: Optional[Iterable[Any]]):
374
+ # pylint: disable=protected-access
375
+ row = conn._conn.execute(sql, parameters).fetchone()
376
+ conn._conn.commit()
377
+ return row
378
+
379
+ # pylint: disable=protected-access
380
+ return await conn._execute(exec_and_get_returning_value, sql,
381
+ parameters)
382
+
356
383
  async def close(self):
357
384
  if self._async_conn is not None:
358
385
  await self._async_conn.close()
@@ -375,32 +402,82 @@ def get_max_connections():
375
402
  return _max_connections
376
403
 
377
404
 
378
- def get_engine(db_name: str):
405
+ @typing.overload
406
+ def get_engine(
407
+ db_name: Optional[str],
408
+ async_engine: Literal[False] = False) -> sqlalchemy.engine.Engine:
409
+ ...
410
+
411
+
412
+ @typing.overload
413
+ def get_engine(db_name: Optional[str],
414
+ async_engine: Literal[True]) -> sqlalchemy_async.AsyncEngine:
415
+ ...
416
+
417
+
418
+ def get_engine(
419
+ db_name: Optional[str],
420
+ async_engine: bool = False
421
+ ) -> Union[sqlalchemy.engine.Engine, sqlalchemy_async.AsyncEngine]:
422
+ """Get the engine for the given database name.
423
+
424
+ Args:
425
+ db_name: The name of the database. ONLY used for SQLite. On Postgres,
426
+ we use a single database, which we get from the connection string.
427
+ async_engine: Whether to return an async engine.
428
+ """
379
429
  conn_string = None
380
430
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
381
431
  conn_string = os.environ.get(constants.ENV_VAR_DB_CONNECTION_URI)
382
432
  if conn_string:
433
+ if async_engine:
434
+ conn_string = conn_string.replace('postgresql://',
435
+ 'postgresql+asyncpg://')
383
436
  with _db_creation_lock:
437
+ # We use the same cache for both sync and async engines
438
+ # because we change the conn_string in the async case,
439
+ # so they would not overlap.
384
440
  if conn_string not in _postgres_engine_cache:
441
+ engine_type = 'sync' if not async_engine else 'async'
442
+ logger.debug(
443
+ f'Creating a new postgres {engine_type} engine with '
444
+ f'maximum {_max_connections} connections')
385
445
  if _max_connections == 0:
386
- _postgres_engine_cache[conn_string] = (
387
- sqlalchemy.create_engine(
388
- conn_string, poolclass=sqlalchemy.pool.NullPool))
389
- elif _max_connections == 1:
390
- _postgres_engine_cache[conn_string] = (
391
- sqlalchemy.create_engine(
392
- conn_string, poolclass=sqlalchemy.pool.StaticPool))
446
+ kw_args = {'poolclass': sqlalchemy.NullPool}
447
+ if async_engine:
448
+ _postgres_engine_cache[conn_string] = (
449
+ sqlalchemy_async.create_async_engine(
450
+ conn_string, **kw_args))
451
+ else:
452
+ _postgres_engine_cache[conn_string] = (
453
+ sqlalchemy.create_engine(conn_string, **kw_args))
393
454
  else:
394
- _postgres_engine_cache[conn_string] = (
395
- sqlalchemy.create_engine(
396
- conn_string,
397
- poolclass=sqlalchemy.pool.QueuePool,
398
- size=_max_connections,
399
- max_overflow=0))
455
+ kw_args = {
456
+ 'pool_size': _max_connections,
457
+ 'max_overflow': max(0, 5 - _max_connections),
458
+ 'pool_pre_ping': True,
459
+ 'pool_recycle': 1800
460
+ }
461
+ if async_engine:
462
+ kw_args[
463
+ 'poolclass'] = sqlalchemy.pool.AsyncAdaptedQueuePool
464
+ _postgres_engine_cache[conn_string] = (
465
+ sqlalchemy_async.create_async_engine(
466
+ conn_string, **kw_args))
467
+ else:
468
+ kw_args['poolclass'] = sqlalchemy.pool.QueuePool
469
+ _postgres_engine_cache[conn_string] = (
470
+ sqlalchemy.create_engine(conn_string, **kw_args))
400
471
  engine = _postgres_engine_cache[conn_string]
401
472
  else:
402
- db_path = os.path.expanduser(f'~/.sky/{db_name}.db')
473
+ assert db_name is not None, 'db_name must be provided for SQLite'
474
+ db_path = runtime_utils.get_runtime_dir_path(f'.sky/{db_name}.db')
403
475
  pathlib.Path(db_path).parents[0].mkdir(parents=True, exist_ok=True)
476
+ if async_engine:
477
+ # This is an AsyncEngine, instead of a (normal, synchronous) Engine,
478
+ # so we should not put it in the cache. Instead, just return.
479
+ return sqlalchemy_async.create_async_engine(
480
+ 'sqlite+aiosqlite:///' + db_path, connect_args={'timeout': 30})
404
481
  if db_path not in _sqlite_engine_cache:
405
482
  _sqlite_engine_cache[db_path] = sqlalchemy.create_engine(
406
483
  'sqlite:///' + db_path)