skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/scheduler.py CHANGED
@@ -9,17 +9,22 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
9
9
  be called from any code running on the managed jobs controller instance to
10
10
  trigger scheduling of new jobs if possible. This function should be called
11
11
  immediately after any state change that could result in jobs newly being able to
12
- be scheduled.
12
+ be scheduled. If the job is running in a pool, the scheduler will only schedule
13
+ jobs for the same pool, because the resource limitations are per-pool (see the
14
+ following section for more details).
13
15
 
14
- The scheduling logic limits the number of running jobs according to two limits:
16
+ The scheduling logic limits #running jobs according to three limits:
15
17
  1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
16
- once, based on the number of CPUs. (See _get_launch_parallelism.) This is the
17
- most compute-intensive part of the job lifecycle, which is why we have an
18
- additional limit.
18
+ once, based on the number of CPUs. This is the most compute-intensive part of
19
+ the job lifecycle, which is why we have an additional limit.
20
+ See sky/utils/controller_utils.py::_get_launch_parallelism.
19
21
  2. The number of jobs that can be running at any given time, based on the amount
20
- of memory. (See _get_job_parallelism.) Since the job controller is doing very
21
- little once a job starts (just checking its status periodically), the most
22
- significant resource it consumes is memory.
22
+ of memory. Since the job controller is doing very little once a job starts
23
+ (just checking its status periodically), the most significant resource it
24
+ consumes is memory.
25
+ See sky/utils/controller_utils.py::_get_job_parallelism.
26
+ 3. The number of jobs that can be running in a pool at any given time, based on
27
+ the number of ready workers in the pool. (See _can_start_new_job.)
23
28
 
24
29
  The state of the scheduler is entirely determined by the schedule_state column
25
30
  of all the jobs in the job_info table. This column should only be modified via
@@ -37,152 +42,220 @@ Nomenclature:
37
42
  """
38
43
 
39
44
  from argparse import ArgumentParser
45
+ import asyncio
40
46
  import contextlib
41
- from functools import lru_cache
42
47
  import os
43
- import time
48
+ import pathlib
49
+ import shutil
50
+ import sys
44
51
  import typing
52
+ from typing import List, Optional, Set
53
+ import uuid
45
54
 
46
55
  import filelock
47
56
 
48
57
  from sky import sky_logging
58
+ from sky import skypilot_config
49
59
  from sky.adaptors import common as adaptors_common
60
+ from sky.client import sdk
50
61
  from sky.jobs import constants as managed_job_constants
51
62
  from sky.jobs import state
63
+ from sky.jobs import utils as managed_job_utils
52
64
  from sky.skylet import constants
53
- from sky.utils import common_utils
65
+ from sky.utils import controller_utils
54
66
  from sky.utils import subprocess_utils
55
67
 
56
68
  if typing.TYPE_CHECKING:
69
+ import logging
70
+
57
71
  import psutil
58
72
  else:
59
73
  psutil = adaptors_common.LazyImport('psutil')
60
74
 
61
75
  logger = sky_logging.init_logger('sky.jobs.controller')
62
76
 
63
- # The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
64
- # parallelism control or updating the schedule_state of any job.
65
- # Any code that takes this lock must conclude by calling
66
- # maybe_schedule_next_jobs.
67
- _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
68
- _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
69
-
70
- # Based on testing, assume a running job uses 350MB memory.
71
- JOB_MEMORY_MB = 350
72
- # Past 2000 simultaneous jobs, we become unstable.
73
- # See https://github.com/skypilot-org/skypilot/issues/4649.
74
- MAX_JOB_LIMIT = 2000
75
- # Number of ongoing launches launches allowed per CPU.
76
- LAUNCHES_PER_CPU = 4
77
-
78
-
79
- @lru_cache(maxsize=1)
80
- def _get_lock_path() -> str:
81
- path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
82
- os.makedirs(os.path.dirname(path), exist_ok=True)
83
- return path
84
-
85
-
86
- def maybe_schedule_next_jobs() -> None:
87
- """Determine if any managed jobs can be scheduled, and if so, schedule them.
88
-
89
- Here, "schedule" means to select job that is waiting, and allow it to
90
- proceed. It does NOT mean to submit a job to the scheduler.
91
-
92
- For newly submitted jobs, scheduling means updating the state of the jobs,
93
- and starting the job controller process. For jobs that are already alive but
94
- are waiting to launch a new task or recover, just update the state of the
95
- job to indicate that the launch can proceed.
96
-
97
- This function transitions jobs into LAUNCHING on a best-effort basis. That
98
- is, if we can start any jobs, we will, but if not, we will exit (almost)
99
- immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
100
- be started now (either because the lock is held, or because there are not
101
- enough resources), another call to this function will be made whenever that
102
- situation is resolved. (If the lock is held, the lock holder should start
103
- the jobs. If there aren't enough resources, the next controller to exit and
104
- free up resources should start the jobs.)
105
-
106
- If this function obtains the lock, it will launch as many jobs as possible
107
- before releasing the lock. This is what allows other calls to exit
108
- immediately if the lock is held, while ensuring that all jobs are started as
109
- soon as possible.
110
-
111
- This uses subprocess_utils.launch_new_process_tree() to start the controller
112
- processes, which should be safe to call from pretty much any code running on
113
- the jobs controller instance. New job controller processes will be detached
114
- from the current process and there will not be a parent/child relationship.
115
- See launch_new_process_tree for more.
77
+ # Job controller lock. This is used to synchronize writing/reading the
78
+ # controller pid file.
79
+ JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
80
+ '~/.sky/locks/job_controller_pid.lock')
81
+
82
+ JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
83
+ JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
84
+
85
+ CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
86
+
87
+
88
+ def _parse_controller_pid_entry(
89
+ entry: str) -> Optional[state.ControllerPidRecord]:
90
+ entry = entry.strip()
91
+ if not entry:
92
+ return None
93
+ # The entry should be like <pid>,<started_at>
94
+ # pid is an integer, started_at is a float
95
+ # For backwards compatibility, we also support just <pid>
96
+ entry_parts = entry.split(',')
97
+ if len(entry_parts) == 2:
98
+ [raw_pid, raw_started_at] = entry_parts
99
+ elif len(entry_parts) == 1:
100
+ # Backwards compatibility, pre-#7847
101
+ # TODO(cooperc): Remove for 0.13.0
102
+ raw_pid = entry_parts[0]
103
+ raw_started_at = None
104
+ else:
105
+ # Unknown format
106
+ return None
107
+
108
+ try:
109
+ pid = int(raw_pid)
110
+ except ValueError:
111
+ return None
112
+
113
+ started_at: Optional[float] = None
114
+ if raw_started_at:
115
+ try:
116
+ started_at = float(raw_started_at)
117
+ except ValueError:
118
+ started_at = None
119
+ return state.ControllerPidRecord(pid=pid, started_at=started_at)
120
+
121
+
122
+ def get_controller_process_records(
123
+ ) -> Optional[List[state.ControllerPidRecord]]:
124
+ """Return recorded controller processes if the file can be read."""
125
+ if not os.path.exists(JOB_CONTROLLER_PID_PATH):
126
+ # If the file doesn't exist, it means the controller server is not
127
+ # running, so we return an empty list
128
+ return []
129
+ try:
130
+ with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
131
+ lines = f.read().splitlines()
132
+ except (FileNotFoundError, OSError):
133
+ return None
134
+
135
+ records: List[state.ControllerPidRecord] = []
136
+ for line in lines:
137
+ record = _parse_controller_pid_entry(line)
138
+ if record is not None:
139
+ records.append(record)
140
+ return records
141
+
142
+
143
+ def _append_controller_pid_record(pid: int,
144
+ started_at: Optional[float]) -> None:
145
+ # Note: started_at is a float, but converting to a string will not lose any
146
+ # precision. See https://docs.python.org/3/tutorial/floatingpoint.html and
147
+ # https://github.com/python/cpython/issues/53583
148
+ entry = str(pid) if started_at is None else f'{pid},{started_at}'
149
+ with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
150
+ f.write(entry + '\n')
151
+
152
+
153
+ def start_controller() -> None:
154
+ """Start the job controller process.
155
+
156
+ This requires that the env file is already set up.
157
+ """
158
+ os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
159
+ logs_dir = os.path.expanduser(
160
+ managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
161
+ os.makedirs(logs_dir, exist_ok=True)
162
+ controller_uuid = str(uuid.uuid4())
163
+ log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')
164
+
165
+ activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
166
+ run_controller_cmd = (f'{sys.executable} -u -m'
167
+ f'sky.jobs.controller {controller_uuid}')
168
+
169
+ run_cmd = (f'{activate_python_env_cmd}'
170
+ f'{run_controller_cmd}')
171
+
172
+ logger.info(f'Running controller with command: {run_cmd}')
173
+
174
+ pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
175
+ pid_started_at = psutil.Process(pid).create_time()
176
+ _append_controller_pid_record(pid, pid_started_at)
177
+
178
+
179
+ def get_alive_controllers() -> Optional[int]:
180
+ records = get_controller_process_records()
181
+ if records is None:
182
+ # If we cannot read the file reliably, avoid starting extra controllers.
183
+ return None
184
+ if not records:
185
+ return 0
186
+
187
+ alive = 0
188
+ for record in records:
189
+ if managed_job_utils.controller_process_alive(record, quiet=False):
190
+ alive += 1
191
+ return alive
192
+
193
+
194
+ def maybe_start_controllers(from_scheduler: bool = False) -> None:
195
+ """Start the job controller process.
196
+
197
+ If the process is already running, it will not start a new one.
198
+ Will also add the job_id, dag_yaml_path, and env_file_path to the
199
+ controllers list of processes.
116
200
  """
201
+ # In consolidation mode, during rolling update, two API servers may be
202
+ # running. If we are on the new API server, and we haven't finished the
203
+ # recovery process, we should avoid starting new controllers. The old API
204
+ # server/consolidated jobs controller could run update_managed_jobs_statuses
205
+ # and if there are jobs running on the new API server, the old one will not
206
+ # see the corresponding processes and may mark them as FAILED_CONTROLLER.
207
+ if from_scheduler and managed_job_utils.is_consolidation_mode(
208
+ ) and os.path.exists(
209
+ os.path.expanduser(
210
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
211
+ # This could happen during an API server rolling update, or during
212
+ # normal running while managed-job-status-refresh-daemon is running. In
213
+ # either case, the controllers should be already started or will be
214
+ # started by the recovery process.
215
+ logger.info('Recovery is still in progress, skipping controller start.')
216
+ return
117
217
  try:
118
- # We must use a global lock rather than a per-job lock to ensure correct
119
- # parallelism control. If we cannot obtain the lock, exit immediately.
120
- # The current lock holder is expected to launch any jobs it can before
121
- # releasing the lock.
122
- with filelock.FileLock(_get_lock_path(), blocking=False):
123
- while True:
124
- maybe_next_job = state.get_waiting_job()
125
- if maybe_next_job is None:
126
- # Nothing left to start, break from scheduling loop
127
- break
128
-
129
- current_state = maybe_next_job['schedule_state']
130
-
131
- assert current_state in (
132
- state.ManagedJobScheduleState.ALIVE_WAITING,
133
- state.ManagedJobScheduleState.WAITING), maybe_next_job
134
-
135
- # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
136
- # since they will have been submitted and therefore started
137
- # first. The requirements to launch in an alive job are more
138
- # lenient, so there is no way that we wouldn't be able to launch
139
- # an ALIVE_WAITING job, but we would be able to launch a WAITING
140
- # job.
141
- if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
142
- if not _can_lauch_in_alive_job():
143
- # Can't schedule anything, break from scheduling loop.
144
- break
145
- elif current_state == state.ManagedJobScheduleState.WAITING:
146
- if not _can_start_new_job():
147
- # Can't schedule anything, break from scheduling loop.
148
- break
149
-
150
- logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
151
- state.scheduler_set_launching(maybe_next_job['job_id'],
152
- current_state)
153
-
154
- if current_state == state.ManagedJobScheduleState.WAITING:
155
- # The job controller has not been started yet. We must start
156
- # it.
157
-
158
- job_id = maybe_next_job['job_id']
159
- dag_yaml_path = maybe_next_job['dag_yaml_path']
160
-
161
- activate_python_env_cmd = (
162
- f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
163
- env_file = maybe_next_job['env_file_path']
164
- source_environment_cmd = (f'source {env_file};'
165
- if env_file else '')
166
- run_controller_cmd = ('python -u -m sky.jobs.controller '
167
- f'{dag_yaml_path} --job-id {job_id};')
168
-
169
- # If the command line here is changed, please also update
170
- # utils._controller_process_alive. `--job-id X` should be at
171
- # the end.
172
- run_cmd = (f'{activate_python_env_cmd}'
173
- f'{source_environment_cmd}'
174
- f'{run_controller_cmd}')
175
-
176
- logs_dir = os.path.expanduser(
177
- managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
178
- os.makedirs(logs_dir, exist_ok=True)
179
- log_path = os.path.join(logs_dir, f'{job_id}.log')
180
-
181
- pid = subprocess_utils.launch_new_process_tree(
182
- run_cmd, log_output=log_path)
183
- state.set_job_controller_pid(job_id, pid)
184
-
185
- logger.debug(f'Job {job_id} started with pid {pid}')
218
+ with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
219
+ if from_scheduler and not managed_job_utils.is_consolidation_mode():
220
+ cur = pathlib.Path(CURRENT_HASH)
221
+ old = pathlib.Path(f'{CURRENT_HASH}.old')
222
+
223
+ if old.exists() and cur.exists():
224
+ if (old.read_text(encoding='utf-8') !=
225
+ cur.read_text(encoding='utf-8')):
226
+ # TODO(luca): there is a 1/2^160 chance that there will
227
+ # be a collision. using a geometric distribution and
228
+ # assuming one update a day, we expect a bug slightly
229
+ # before the heat death of the universe. should get
230
+ # this fixed before then.
231
+ try:
232
+ # this will stop all the controllers and the api
233
+ # server.
234
+ sdk.api_stop()
235
+ # All controllers should be dead. Remove the PIDs so
236
+ # that update_managed_jobs_statuses won't think they
237
+ # have failed.
238
+ state.reset_jobs_for_recovery()
239
+ except Exception as e: # pylint: disable=broad-except
240
+ logger.error(f'Failed to stop the api server: {e}')
241
+ pass
242
+ else:
243
+ shutil.copyfile(cur, old)
244
+ if not old.exists():
245
+ shutil.copyfile(cur, old)
246
+
247
+ alive = get_alive_controllers()
248
+ if alive is None:
249
+ return
250
+ wanted = controller_utils.get_number_of_jobs_controllers()
251
+ started = 0
252
+
253
+ while alive + started < wanted:
254
+ start_controller()
255
+ started += 1
256
+
257
+ if started > 0:
258
+ logger.info(f'Started {started} controllers')
186
259
 
187
260
  except filelock.Timeout:
188
261
  # If we can't get the lock, just exit. The process holding the lock
@@ -190,24 +263,64 @@ def maybe_schedule_next_jobs() -> None:
190
263
  pass
191
264
 
192
265
 
193
- def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
266
+ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
267
+ env_file_path: str, priority: int) -> None:
194
268
  """Submit an existing job to the scheduler.
195
269
 
196
270
  This should be called after a job is created in the `spot` table as
197
271
  PENDING. It will tell the scheduler to try and start the job controller, if
198
- there are resources available. It may block to acquire the lock, so it
199
- should not be on the critical path for `sky jobs launch -d`.
272
+ there are resources available.
200
273
 
201
274
  The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
202
275
  """
203
- with filelock.FileLock(_get_lock_path()):
204
- state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
205
- common_utils.get_user_hash())
206
- maybe_schedule_next_jobs()
207
-
208
-
209
- @contextlib.contextmanager
210
- def scheduled_launch(job_id: int):
276
+ controller_process = state.get_job_controller_process(job_id)
277
+ if controller_process is not None:
278
+ # why? TODO(cooperc): figure out why this is needed, fix it, and remove
279
+ if managed_job_utils.controller_process_alive(controller_process,
280
+ job_id):
281
+ # This can happen when HA recovery runs for some reason but the job
282
+ # controller is still alive.
283
+ logger.warning(f'Job {job_id} is still alive with controller '
284
+ f'{controller_process}, skipping submission')
285
+ maybe_start_controllers(from_scheduler=True)
286
+ return
287
+
288
+ with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
289
+ dag_yaml_content = dag_file.read()
290
+ with open(original_user_yaml_path, 'r',
291
+ encoding='utf-8') as original_user_yaml_file:
292
+ original_user_yaml_content = original_user_yaml_file.read()
293
+ with open(env_file_path, 'r', encoding='utf-8') as env_file:
294
+ env_file_content = env_file.read()
295
+
296
+ # Read config file if SKYPILOT_CONFIG env var is set
297
+ config_file_content: Optional[str] = None
298
+ config_file_path = os.environ.get(skypilot_config.ENV_VAR_SKYPILOT_CONFIG)
299
+ if config_file_path:
300
+ config_file_path = os.path.expanduser(config_file_path)
301
+ if os.path.exists(config_file_path):
302
+ with open(config_file_path, 'r', encoding='utf-8') as config_file:
303
+ config_file_content = config_file.read()
304
+
305
+ config_bytes = (len(config_file_content) if config_file_content else 0)
306
+ logger.debug(f'Storing job {job_id} file contents in database '
307
+ f'(DAG bytes={len(dag_yaml_content)}, '
308
+ f'original user yaml bytes={len(original_user_yaml_content)}, '
309
+ f'env bytes={len(env_file_content)}, '
310
+ f'config bytes={config_bytes}).')
311
+ state.scheduler_set_waiting(job_id, dag_yaml_content,
312
+ original_user_yaml_content, env_file_content,
313
+ config_file_content, priority)
314
+ maybe_start_controllers(from_scheduler=True)
315
+
316
+
317
+ @contextlib.asynccontextmanager
318
+ async def scheduled_launch(
319
+ job_id: int,
320
+ starting: Set[int],
321
+ starting_lock: asyncio.Lock,
322
+ starting_signal: asyncio.Condition,
323
+ ):
211
324
  """Launch as part of an ongoing job.
212
325
 
213
326
  A newly started job will already be LAUNCHING, and this will immediately
@@ -228,23 +341,42 @@ def scheduled_launch(job_id: int):
228
341
  multiple uses of this context are nested, behavior is undefined. Don't do
229
342
  that.
230
343
  """
344
+ pool = state.get_pool_from_job_id(job_id)
345
+ # For pool, since there is no execution.launch, we don't need to have all
346
+ # the ALIVE_WAITING state. The state transition will be
347
+ # WAITING -> ALIVE -> DONE without any intermediate transitions.
348
+ if pool is not None:
349
+ yield
350
+ return
351
+
352
+ assert starting_lock == starting_signal._lock, ( # type: ignore #pylint: disable=protected-access
353
+ 'starting_lock and starting_signal must use the same lock')
231
354
 
232
- # If we're already in LAUNCHING schedule_state, we don't need to wait.
233
- # This may be the case for the first launch of a job.
234
- if (state.get_job_schedule_state(job_id) !=
235
- state.ManagedJobScheduleState.LAUNCHING):
236
- # Since we aren't LAUNCHING, we need to wait to be scheduled.
237
- _set_alive_waiting(job_id)
355
+ while True:
356
+ async with starting_lock:
357
+ starting_count = len(starting)
358
+ if starting_count < controller_utils.LAUNCHES_PER_WORKER:
359
+ break
360
+ logger.info('Too many jobs starting, waiting for a slot')
361
+ await starting_signal.wait()
238
362
 
239
- while (state.get_job_schedule_state(job_id) !=
240
- state.ManagedJobScheduleState.LAUNCHING):
241
- time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
363
+ logger.info(f'Starting job {job_id}')
242
364
 
243
- yield
365
+ async with starting_lock:
366
+ starting.add(job_id)
244
367
 
245
- with filelock.FileLock(_get_lock_path()):
246
- state.scheduler_set_alive(job_id)
247
- maybe_schedule_next_jobs()
368
+ await state.scheduler_set_launching_async(job_id)
369
+
370
+ try:
371
+ yield
372
+ except Exception as e:
373
+ raise e
374
+ else:
375
+ await state.scheduler_set_alive_async(job_id)
376
+ finally:
377
+ async with starting_lock:
378
+ starting.remove(job_id)
379
+ starting_signal.notify()
248
380
 
249
381
 
250
382
  def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -255,46 +387,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
255
387
 
256
388
  The job could be in any terminal ManagedJobStatus. However, once DONE, it
257
389
  should never transition back to another state.
390
+
391
+ This is only called by utils.update_managed_jobs_statuses which is sync.
258
392
  """
259
393
  if idempotent and (state.get_job_schedule_state(job_id)
260
394
  == state.ManagedJobScheduleState.DONE):
261
395
  return
262
396
 
263
- with filelock.FileLock(_get_lock_path()):
264
- state.scheduler_set_done(job_id, idempotent)
265
- maybe_schedule_next_jobs()
266
-
267
-
268
- def _set_alive_waiting(job_id: int) -> None:
269
- """Should use wait_until_launch_okay() to transition to this state."""
270
- with filelock.FileLock(_get_lock_path()):
271
- state.scheduler_set_alive_waiting(job_id)
272
- maybe_schedule_next_jobs()
273
-
397
+ state.scheduler_set_done(job_id, idempotent)
274
398
 
275
- def _get_job_parallelism() -> int:
276
- job_memory = JOB_MEMORY_MB * 1024 * 1024
277
-
278
- job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
279
-
280
- return max(job_limit, 1)
281
-
282
-
283
- def _get_launch_parallelism() -> int:
284
- cpus = os.cpu_count()
285
- return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
286
-
287
-
288
- def _can_start_new_job() -> bool:
289
- launching_jobs = state.get_num_launching_jobs()
290
- alive_jobs = state.get_num_alive_jobs()
291
- return launching_jobs < _get_launch_parallelism(
292
- ) and alive_jobs < _get_job_parallelism()
293
399
 
400
+ async def job_done_async(job_id: int, idempotent: bool = False):
401
+ """Async version of job_done."""
402
+ if idempotent and (await state.get_job_schedule_state_async(job_id)
403
+ == state.ManagedJobScheduleState.DONE):
404
+ return
294
405
 
295
- def _can_lauch_in_alive_job() -> bool:
296
- launching_jobs = state.get_num_launching_jobs()
297
- return launching_jobs < _get_launch_parallelism()
406
+ await state.scheduler_set_done_async(job_id, idempotent)
298
407
 
299
408
 
300
409
  if __name__ == '__main__':
@@ -302,6 +411,9 @@ if __name__ == '__main__':
302
411
  parser.add_argument('dag_yaml',
303
412
  type=str,
304
413
  help='The path to the user job yaml file.')
414
+ parser.add_argument('--user-yaml-path',
415
+ type=str,
416
+ help='The path to the original user job yaml file.')
305
417
  parser.add_argument('--job-id',
306
418
  required=True,
307
419
  type=int,
@@ -309,5 +421,18 @@ if __name__ == '__main__':
309
421
  parser.add_argument('--env-file',
310
422
  type=str,
311
423
  help='The path to the controller env file.')
424
+ parser.add_argument('--pool',
425
+ type=str,
426
+ required=False,
427
+ default=None,
428
+ help='The pool to use for the controller job.')
429
+ parser.add_argument(
430
+ '--priority',
431
+ type=int,
432
+ default=constants.DEFAULT_PRIORITY,
433
+ help=
434
+ f'Job priority ({constants.MIN_PRIORITY} to {constants.MAX_PRIORITY}).'
435
+ f' Default: {constants.DEFAULT_PRIORITY}.')
312
436
  args = parser.parse_args()
313
- submit_job(args.job_id, args.dag_yaml, args.env_file)
437
+ submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
438
+ args.priority)