skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -9,17 +9,22 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
 be called from any code running on the managed jobs controller instance to
 trigger scheduling of new jobs if possible. This function should be called
 immediately after any state change that could result in jobs newly being able to
-be scheduled.
+be scheduled. If the job is running in a pool, the scheduler will only schedule
+jobs for the same pool, because the resources limitations are per-pool (see the
+following section for more details).

-The scheduling logic limits the number of running jobs according to two limits:
+The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs. (See _get_launch_parallelism.) This the
-   most compute-intensive part of the job lifecycle, which is why we have an
-   additional limit.
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory. (See _get_job_parallelism.) Since the job controller is doing very
-   little once a job starts (just checking its status periodically), the most
-   significant resource it consumes is memory.
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
+3. The number of jobs that can be running in a pool at any given time, based on
+   the number of ready workers in the pool. (See _can_start_new_job.)

 The state of the scheduler is entirely determined by the schedule_state column
 of all the jobs in the job_info table. This column should only be modified via
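
The two CPU- and memory-based limits described above were implemented by helpers that this diff removes (the removed bodies appear further down). A minimal standalone sketch of that pre-change logic, with constants copied from the removed code and psutil assumed available:

import os
import psutil

JOB_MEMORY_MB = 350      # old assumption: a running job uses ~350MB
MAX_JOB_LIMIT = 2000     # old hard cap (skypilot issue #4649)
LAUNCHES_PER_CPU = 4     # old launch budget per CPU

def launch_parallelism() -> int:
    # Limit 1: concurrent STARTING/RECOVERING jobs scale with CPU count.
    cpus = os.cpu_count()
    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1

def job_parallelism() -> int:
    # Limit 2: total running jobs scale with total memory, with a hard cap.
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    return max(min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT), 1)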
@@ -37,152 +42,227 @@ Nomenclature:
 """

 from argparse import ArgumentParser
+import asyncio
 import contextlib
-from functools import lru_cache
 import os
-import time
+import pathlib
+import shutil
+import sys
 import typing
+from typing import Set
+import uuid

 import filelock

 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.client import sdk
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
+from sky.jobs import utils as managed_job_utils
+from sky.server import config as server_config
 from sky.skylet import constants
-from sky.utils import common_utils
+from sky.utils import annotations
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils

 if typing.TYPE_CHECKING:
+    import logging
+
     import psutil
 else:
     psutil = adaptors_common.LazyImport('psutil')

 logger = sky_logging.init_logger('sky.jobs.controller')

-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
-_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
-
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
-
-def maybe_schedule_next_jobs() -> None:
-    """Determine if any managed jobs can be scheduled, and if so, schedule them.
-
-    Here, "schedule" means to select job that is waiting, and allow it to
-    proceed. It does NOT mean to submit a job to the scheduler.
-
-    For newly submitted jobs, scheduling means updating the state of the jobs,
-    and starting the job controller process. For jobs that are already alive but
-    are waiting to launch a new task or recover, just update the state of the
-    job to indicate that the launch can proceed.
-
-    This function transitions jobs into LAUNCHING on a best-effort basis. That
-    is, if we can start any jobs, we will, but if not, we will exit (almost)
-    immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
-    be started now (either because the lock is held, or because there are not
-    enough resources), another call to this function will be made whenever that
-    situation is resolved. (If the lock is held, the lock holder should start
-    the jobs. If there aren't enough resources, the next controller to exit and
-    free up resources should start the jobs.)
-
-    If this function obtains the lock, it will launch as many jobs as possible
-    before releasing the lock. This is what allows other calls to exit
-    immediately if the lock is held, while ensuring that all jobs are started as
-    soon as possible.
-
-    This uses subprocess_utils.launch_new_process_tree() to start the controller
-    processes, which should be safe to call from pretty much any code running on
-    the jobs controller instance. New job controller processes will be detached
-    from the current process and there will not be a parent/child relationship.
-    See launch_new_process_tree for more.
+# Job controller lock. This is used to synchronize writing/reading the
+# controller pid file.
+JOB_CONTROLLER_PID_LOCK = os.path.expanduser(
+    '~/.sky/locks/job_controller_pid.lock')
+
+JOB_CONTROLLER_PID_PATH = os.path.expanduser('~/.sky/job_controller_pid')
+JOB_CONTROLLER_ENV_PATH = os.path.expanduser('~/.sky/job_controller_env')
+
+# Based on testing, each worker takes around 200-300MB memory. Keeping it
+# higher to be safe.
+JOB_MEMORY_MB = 400
+# Number of ongoing launches launches allowed per worker. Can probably be
+# increased a bit to around 16 but keeping it lower to just to be safe
+LAUNCHES_PER_WORKER = 8
+# this can probably be increased to around 300-400 but keeping it lower to just
+# to be safe
+MAX_JOBS_PER_WORKER = 200
+# Maximum number of controllers that can be running. Hard to handle more than
+# 512 launches at once.
+MAX_CONTROLLERS = 512 // LAUNCHES_PER_WORKER
+# Limit the number of jobs that can be running at once on the entire jobs
+# controller cluster. It's hard to handle cancellation of more than 2000 jobs at
+# once.
+# TODO(cooperc): Once we eliminate static bottlenecks (e.g. sqlite), remove this
+# hardcoded max limit.
+MAX_TOTAL_RUNNING_JOBS = 2000
+# Maximum values for above constants. There will start to be lagging issues
+# at these numbers already.
+# JOB_MEMORY_MB = 200
+# LAUNCHES_PER_WORKER = 16
+# JOBS_PER_WORKER = 400
+
+# keep 2GB reserved after the controllers
+MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB = 2048
+
+CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash')
+
+
+@annotations.lru_cache(scope='global')
+def get_number_of_controllers() -> int:
+    """Returns the number of controllers that should be running.
+
+    This is the number of controllers that should be running to maximize
+    resource utilization.
+
+    In consolidation mode, we use the existing API server so our resource
+    requirements are just for the job controllers. We try taking up as much
+    much memory as possible left over from the API server.
+
+    In non-consolidation mode, we have to take into account the memory of the
+    API server workers. We limit to only 8 launches per worker, so our logic is
+    each controller will take CONTROLLER_MEMORY_MB + 8 * WORKER_MEMORY_MB. We
+    leave some leftover room for ssh codegen and ray status overhead.
+    """
+    consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    total_memory_mb = controller_utils.get_controller_mem_size_gb() * 1024
+    if consolidation_mode:
+        config = server_config.compute_server_config(deploy=True, quiet=True)
+
+        used = 0.0
+        used += MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
+        used += (config.long_worker_config.garanteed_parallelism +
+                 config.long_worker_config.burstable_parallelism) * \
+            server_config.LONG_WORKER_MEM_GB * 1024
+        used += (config.short_worker_config.garanteed_parallelism +
+                 config.short_worker_config.burstable_parallelism) * \
+            server_config.SHORT_WORKER_MEM_GB * 1024
+
+        return min(MAX_CONTROLLERS,
+                   max(1, int((total_memory_mb - used) // JOB_MEMORY_MB)))
+    else:
+        return min(
+            MAX_CONTROLLERS,
+            max(
+                1,
+                int((total_memory_mb - MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB) /
+                    ((LAUNCHES_PER_WORKER * server_config.LONG_WORKER_MEM_GB) *
+                     1024 + JOB_MEMORY_MB))))
+
+
+def start_controller() -> None:
+    """Start the job controller process.
+
+    This requires that the env file is already set up.
+    """
+    os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
+    logs_dir = os.path.expanduser(
+        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(logs_dir, exist_ok=True)
+    controller_uuid = str(uuid.uuid4())
+    log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')
+
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    run_controller_cmd = (f'{sys.executable} -u -m'
+                          f'sky.jobs.controller {controller_uuid}')
+
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{run_controller_cmd}')
+
+    logger.info(f'Running controller with command: {run_cmd}')
+
+    pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
+    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f:
+        f.write(str(pid) + '\n')
+
+
+def get_alive_controllers() -> typing.Optional[int]:
+    if not os.path.exists(JOB_CONTROLLER_PID_PATH):
+        # if the file doesn't exist, it means the controller server is not
+        # running, so we return 0
+        return 0
+
+    try:
+        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
+            pids = f.read().split('\n')[:-1]
+    except OSError:
+        # if the file is corrupted, or any issues with reading it, we just
+        # return None to be safe and not over start
+        return None
+
+    alive = 0
+    for pid in pids:
+        try:
+            # TODO(luca) there is a chance that the process that is alive is
+            # not the same controller process. a better solution is to also
+            # include a random UUID with each controller and store that in the
+            # db as well/in the command that spawns it.
+            if subprocess_utils.is_process_alive(int(pid.strip())):
+                alive += 1
+        except ValueError:
+            # if the pid is not an integer, let's assume it's alive to not
+            # over start new processes
+            alive += 1
+    return alive
+
+
+def maybe_start_controllers(from_scheduler: bool = False) -> None:
+    """Start the job controller process.
+
+    If the process is already running, it will not start a new one.
+    Will also add the job_id, dag_yaml_path, and env_file_path to the
+    controllers list of processes.
     """
     try:
-        # We must use a global lock rather than a per-job lock to ensure correct
-        # parallelism control. If we cannot obtain the lock, exit immediately.
-        # The current lock holder is expected to launch any jobs it can before
-        # releasing the lock.
-        with filelock.FileLock(_get_lock_path(), blocking=False):
-            while True:
-                maybe_next_job = state.get_waiting_job()
-                if maybe_next_job is None:
-                    # Nothing left to start, break from scheduling loop
-                    break
-
-                current_state = maybe_next_job['schedule_state']
-
-                assert current_state in (
-                    state.ManagedJobScheduleState.ALIVE_WAITING,
-                    state.ManagedJobScheduleState.WAITING), maybe_next_job
-
-                # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
-                # since they will have been submitted and therefore started
-                # first. The requirements to launch in an alive job are more
-                # lenient, so there is no way that we wouldn't be able to launch
-                # an ALIVE_WAITING job, but we would be able to launch a WAITING
-                # job.
-                if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not _can_lauch_in_alive_job():
-                        # Can't schedule anything, break from scheduling loop.
-                        break
-                elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job():
-                        # Can't schedule anything, break from scheduling loop.
-                        break
-
-                logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
-                state.scheduler_set_launching(maybe_next_job['job_id'],
-                                              current_state)
-
-                if current_state == state.ManagedJobScheduleState.WAITING:
-                    # The job controller has not been started yet. We must start
-                    # it.
-
-                    job_id = maybe_next_job['job_id']
-                    dag_yaml_path = maybe_next_job['dag_yaml_path']
-
-                    activate_python_env_cmd = (
-                        f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
-                    env_file = maybe_next_job['env_file_path']
-                    source_environment_cmd = (f'source {env_file};'
-                                              if env_file else '')
-                    run_controller_cmd = ('python -u -m sky.jobs.controller '
-                                          f'{dag_yaml_path} --job-id {job_id};')
-
-                    # If the command line here is changed, please also update
-                    # utils._controller_process_alive. `--job-id X` should be at
-                    # the end.
-                    run_cmd = (f'{activate_python_env_cmd}'
-                               f'{source_environment_cmd}'
-                               f'{run_controller_cmd}')
-
-                    logs_dir = os.path.expanduser(
-                        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
-                    os.makedirs(logs_dir, exist_ok=True)
-                    log_path = os.path.join(logs_dir, f'{job_id}.log')
-
-                    pid = subprocess_utils.launch_new_process_tree(
-                        run_cmd, log_output=log_path)
-                    state.set_job_controller_pid(job_id, pid)
-
-                    logger.debug(f'Job {job_id} started with pid {pid}')
+        with filelock.FileLock(JOB_CONTROLLER_PID_LOCK, blocking=False):
+            if from_scheduler and not managed_job_utils.is_consolidation_mode():
+                cur = pathlib.Path(CURRENT_HASH)
+                old = pathlib.Path(f'{CURRENT_HASH}.old')
+
+                if old.exists() and cur.exists():
+                    if (old.read_text(encoding='utf-8') !=
+                            cur.read_text(encoding='utf-8')):
+                        # TODO(luca): there is a 1/2^160 chance that there will
+                        # be a collision. using a geometric distribution and
+                        # assuming one update a day, we expect a bug slightly
+                        # before the heat death of the universe. should get
+                        # this fixed before then.
+                        try:
+                            # this will stop all the controllers and the api
+                            # server.
+                            sdk.api_stop()
+                            # All controllers should be dead. Remove the PIDs so
+                            # that update_managed_jobs_statuses won't think they
+                            # have failed.
+                            state.reset_jobs_for_recovery()
+                        except Exception as e:  # pylint: disable=broad-except
+                            logger.error(f'Failed to stop the api server: {e}')
+                            pass
+                        else:
+                            shutil.copyfile(cur, old)
+                if not old.exists():
+                    shutil.copyfile(cur, old)
+
+            alive = get_alive_controllers()
+            if alive is None:
+                return
+            wanted = get_number_of_controllers()
+            started = 0
+
+            while alive + started < wanted:
+                start_controller()
+                started += 1
+
+            if started > 0:
+                logger.info(f'Started {started} controllers')

     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
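
To see how the non-consolidation branch of get_number_of_controllers sizes the controller pool, here is a hypothetical worked example; the 16 GB controller memory and 0.5 GB LONG_WORKER_MEM_GB figures are illustrative assumptions, not values taken from the package:

# Hypothetical inputs: 16 GB controller memory, 0.5 GB per long API worker.
total_memory_mb = 16 * 1024                 # 16384 MB
reserved_mb = 2048                          # MAXIMUM_CONTROLLER_RESERVED_MEMORY_MB
per_controller_mb = (8 * 0.5) * 1024 + 400  # LAUNCHES_PER_WORKER workers + JOB_MEMORY_MB = 4496 MB
num = min(512 // 8, max(1, int((total_memory_mb - reserved_mb) / per_controller_mb)))
print(num)  # -> 3 controllers on this hypothetical machine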
@@ -190,24 +270,58 @@ def maybe_schedule_next_jobs() -> None:
         pass


-def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
+def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
+               env_file_path: str, priority: int) -> None:
     """Submit an existing job to the scheduler.

     This should be called after a job is created in the `spot` table as
     PENDING. It will tell the scheduler to try and start the job controller, if
-    there are resources available. It may block to acquire the lock, so it
-    should not be on the critical path for `sky jobs launch -d`.
+    there are resources available.

     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
-                                    common_utils.get_user_hash())
-    maybe_schedule_next_jobs()
-
-
-@contextlib.contextmanager
-def scheduled_launch(job_id: int):
+    controller_pid = state.get_job_controller_pid(job_id)
+    if controller_pid is not None:
+        # why? TODO(cooperc): figure out why this is needed, fix it, and remove
+        if managed_job_utils.controller_process_alive(controller_pid, job_id):
+            # This can happen when HA recovery runs for some reason but the job
+            # controller is still alive.
+            logger.warning(f'Job {job_id} is still alive, skipping submission')
+            maybe_start_controllers(from_scheduler=True)
+            return
+
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                priority)
+    if state.get_ha_recovery_script(job_id) is None:
+        # the run command is just the command that called scheduler
+        run = (f'source {env_file_path} && '
+               f'{sys.executable} -m sky.jobs.scheduler {dag_yaml_path} '
+               f'--job-id {job_id} --env-file {env_file_path} '
+               f'--user-yaml-path {original_user_yaml_path} '
+               f'--priority {priority}')
+        state.set_ha_recovery_script(job_id, run)
+    maybe_start_controllers(from_scheduler=True)
+
+
+@contextlib.asynccontextmanager
+async def scheduled_launch(
+    job_id: int,
+    starting: Set[int],
+    starting_lock: asyncio.Lock,
+    starting_signal: asyncio.Condition,
+):
     """Launch as part of an ongoing job.

     A newly started job will already be LAUNCHING, and this will immediately
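
Per the new submit_job above, the HA recovery script stored in the database is simply a replay of the scheduler invocation. For a hypothetical job 7 the stored command would take roughly this shape (the paths, interpreter, and priority value below are illustrative, not from the package):

run = ('source /tmp/job-7.env && '
       '/usr/bin/python3 -m sky.jobs.scheduler /tmp/dag-7.yaml '
       '--job-id 7 --env-file /tmp/job-7.env '
       '--user-yaml-path /tmp/user-7.yaml --priority 500')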
@@ -228,23 +342,42 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return

-    # If we're already in LAUNCHING schedule_state, we don't need to wait.
-    # This may be the case for the first launch of a job.
-    if (state.get_job_schedule_state(job_id) !=
-            state.ManagedJobScheduleState.LAUNCHING):
-        # Since we aren't LAUNCHING, we need to wait to be scheduled.
-        _set_alive_waiting(job_id)
+    assert starting_lock == starting_signal._lock, (  # type: ignore #pylint: disable=protected-access
+        'starting_lock and starting_signal must use the same lock')

-    while (state.get_job_schedule_state(job_id) !=
-           state.ManagedJobScheduleState.LAUNCHING):
-        time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    while True:
+        async with starting_lock:
+            starting_count = len(starting)
+            if starting_count < LAUNCHES_PER_WORKER:
+                break
+            logger.info('Too many jobs starting, waiting for a slot')
+            await starting_signal.wait()

-    yield
+    logger.info(f'Starting job {job_id}')

-    with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_alive(job_id)
-    maybe_schedule_next_jobs()
+    async with starting_lock:
+        starting.add(job_id)
+
+    await state.scheduler_set_launching_async(job_id)
+
+    try:
+        yield
+    except Exception as e:
+        raise e
+    else:
+        await state.scheduler_set_alive_async(job_id)
+    finally:
+        async with starting_lock:
+            starting.remove(job_id)
+            starting_signal.notify()


 def job_done(job_id: int, idempotent: bool = False) -> None:
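
The rewritten scheduled_launch above gates launches with an asyncio.Condition that shares its lock with starting_lock. A self-contained sketch of that bounded-slot pattern, with illustrative names and a slot limit of 2 standing in for LAUNCHES_PER_WORKER:

import asyncio

MAX_SLOTS = 2  # stands in for LAUNCHES_PER_WORKER

async def worker(i: int, active: set, cond: asyncio.Condition):
    async with cond:
        # wait_for() releases the lock while sleeping and re-checks on notify,
        # condensing the explicit while/wait loop used in the diff above.
        await cond.wait_for(lambda: len(active) < MAX_SLOTS)
        active.add(i)
    try:
        await asyncio.sleep(0.1)  # the "launch" work happens outside the lock
    finally:
        async with cond:
            active.remove(i)
            cond.notify()  # wake one waiter, like starting_signal.notify()

async def main():
    active: set = set()
    cond = asyncio.Condition()  # a Condition owns a Lock if none is passed
    await asyncio.gather(*(worker(i, active, cond) for i in range(5)))

asyncio.run(main())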
@@ -255,46 +388,23 @@ def job_done(job_id: int, idempotent: bool = False) -> None:

     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
+
+    This is only called by utils.update_managed_jobs_statuses which is sync.
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return

-    with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
-
-
-def _set_alive_waiting(job_id: int) -> None:
-    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_alive_waiting(job_id)
-        maybe_schedule_next_jobs()
-
-
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
+    state.scheduler_set_done(job_id, idempotent)


-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
-
-
-def _can_start_new_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-    return launching_jobs < _get_launch_parallelism(
-    ) and alive_jobs < _get_job_parallelism()
-
+async def job_done_async(job_id: int, idempotent: bool = False):
+    """Async version of job_done."""
+    if idempotent and (await state.get_job_schedule_state_async(job_id)
+                       == state.ManagedJobScheduleState.DONE):
+        return

-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
+    await state.scheduler_set_done_async(job_id, idempotent)


 if __name__ == '__main__':
@@ -302,6 +412,9 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--user-yaml-path',
+                        type=str,
+                        help='The path to the original user job yaml file.')
     parser.add_argument('--job-id',
                         required=True,
                        type=int,
@@ -309,5 +422,18 @@
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument('--pool',
+                        type=str,
+                        required=False,
+                        default=None,
+                        help='The pool to use for the controller job.')
+    parser.add_argument(
+        '--priority',
+        type=int,
+        default=constants.DEFAULT_PRIORITY,
+        help=
+        f'Job priority ({constants.MIN_PRIORITY} to {constants.MAX_PRIORITY}).'
+        f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
-    submit_job(args.job_id, args.dag_yaml, args.env_file)
+    submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
+               args.priority)
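
With the new flags in place, the scheduler entrypoint is invoked along these lines (paths and the priority value are illustrative):

python -m sky.jobs.scheduler /tmp/dag.yaml --job-id 42 --env-file /tmp/job.env --user-yaml-path /tmp/user.yaml --priority 500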