skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -168,7 +168,7 @@ def build_dockerimage(task: task_mod.Task,
168
168
  build_dir=temp_dir)
169
169
 
170
170
  dst = os.path.join(temp_dir, SKY_DOCKER_WORKDIR)
171
- if task.workdir is not None:
171
+ if task.workdir is not None and isinstance(task.workdir, str):
172
172
  # Copy workdir contents to tempdir
173
173
  shutil.copytree(os.path.expanduser(task.workdir), dst)
174
174
  else:
@@ -178,7 +178,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
178
178
  return handle, False
179
179
 
180
180
  def _sync_workdir(self, handle: LocalDockerResourceHandle,
181
- workdir: Path) -> None:
181
+ workdir: Union[Path, Dict[str, Any]],
182
+ envs_and_secrets: Dict[str, str]) -> None:
182
183
  """Workdir is sync'd by adding to the docker image.
183
184
 
184
185
  This happens in the execute step.
@@ -188,6 +189,15 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
188
189
  ' a NoOp. If you are running sky exec, your workdir has not'
189
190
  ' been updated.')
190
191
 
192
+ def _download_file(self, handle: LocalDockerResourceHandle,
193
+ local_file_path: str, remote_file_path: str) -> None:
194
+ """Syncs file from remote to local."""
195
+ # Copy from docker container to local
196
+ container = self.containers[handle]
197
+ copy_cmd = (
198
+ f'docker cp {container.name}:{remote_file_path} {local_file_path}')
199
+ subprocess.run(copy_cmd, shell=True, check=True)
200
+
191
201
  def _sync_file_mounts(
192
202
  self,
193
203
  handle: LocalDockerResourceHandle,
@@ -273,13 +283,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
273
283
  def _execute(self,
274
284
  handle: LocalDockerResourceHandle,
275
285
  task: 'task_lib.Task',
276
- detach_run: bool,
277
286
  dryrun: bool = False) -> None:
278
287
  """ Launches the container."""
279
- if detach_run:
280
- raise NotImplementedError('detach_run=True is not supported in '
281
- 'LocalDockerBackend.')
282
-
283
288
  if task.num_nodes > 1:
284
289
  raise NotImplementedError(
285
290
  'Tasks with num_nodes > 1 is currently not supported in '
@@ -0,0 +1,633 @@
1
+ """Code generator for task execution."""
2
+
3
+ import copy
4
+ import inspect
5
+ import json
6
+ import math
7
+ import textwrap
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
+ import colorama
11
+
12
+ from sky import sky_logging
13
+ from sky.skylet import constants
14
+ from sky.skylet import log_lib
15
+ from sky.utils import accelerator_registry
16
+ from sky.utils import ux_utils
17
+
18
# Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
# from interfering with the Ray cluster in the user's task (if any).
# These names are joined into `unset VAR && ...` and prepended to the user's
# script by RayCodeGen._add_ray_task.
UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']

logger = sky_logging.init_logger(__name__)
23
+
24
+
25
+ class TaskCodeGen:
26
+ """Base code generator for task execution on Ray and Slurm."""
27
+
28
+ def __init__(self) -> None:
29
+ # Code generated so far, to be joined via '\n'.
30
+ self._code: List[str] = []
31
+ # Guard method calling order.
32
+ self._has_prologue: bool = False
33
+ self._has_epilogue: bool = False
34
+ self._has_setup: bool = False
35
+ # Job ID is used to identify the job (also this generated code).
36
+ self.job_id: Optional[int] = None
37
+
38
    def _add_common_imports(self) -> None:
        """Add common imports for both Ray and Slurm execution.

        These imports are emitted into the *generated* driver program (they
        are not imported here); they are the stdlib names the generated code
        relies on.
        """
        self._code.append(
            textwrap.dedent("""\
                import functools
                import getpass
                import hashlib
                import io
                import os
                import pathlib
                import selectors
                import shlex
                import subprocess
                import sys
                import tempfile
                import textwrap
                import time
                from typing import Dict, List, Optional, Tuple, Union
                """))
57
+
58
    def _add_skylet_imports(self) -> None:
        """Add SkyPilot skylet imports.

        Emitted into the generated program; assumes the `sky` package is
        importable on the remote node where the program runs.
        """
        self._code.append(
            textwrap.dedent("""\
                from sky.skylet import autostop_lib
                from sky.skylet import constants
                from sky.skylet import job_lib
                from sky.utils import log_utils
                from sky.utils import subprocess_utils
                """))
68
+
69
    def _add_logging_functions(self) -> None:
        """Add log streaming functions from log_lib.

        The function sources are pasted verbatim (via inspect.getsource) into
        the generated program so they are defined at its module level.
        """
        self._code += [
            # FIXME: This is a hack to make sure that the functions can be found
            # by ray.remote. This should be removed once we have a better way to
            # specify dependencies for ray.
            inspect.getsource(log_lib._ProcessingArgs),  # pylint: disable=protected-access
            inspect.getsource(log_lib._get_context),  # pylint: disable=protected-access
            inspect.getsource(log_lib._handle_io_stream),  # pylint: disable=protected-access
            inspect.getsource(log_lib.process_subprocess_stream),
            inspect.getsource(log_lib.run_with_log),
            inspect.getsource(log_lib.make_task_bash_script),
            inspect.getsource(log_lib.add_ray_env_vars),
            inspect.getsource(log_lib.run_bash_command_with_log),
            inspect.getsource(log_lib.run_bash_command_with_log_and_return_pid),
        ]
85
+
86
    def _add_waiting_for_resources_msg(self, num_nodes: int) -> None:
        """Appends code that prints a 'waiting for task resources' message.

        Args:
            num_nodes: Number of nodes the task waits on; baked into the
                generated message at codegen time.
        """
        self._code.append(
            textwrap.dedent(f"""\
                plural = 's' if {num_nodes} > 1 else ''
                node_str = f'{num_nodes} node{{plural}}'
                message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
                           'Waiting for task resources on '
                           f'{{node_str}}.{colorama.Style.RESET_ALL}')
                print(message, flush=True)"""))
95
+
96
+ def _get_job_started_msg(self) -> str:
97
+ """Returns the 'Job started' streaming message with ANSI formatting."""
98
+ return (
99
+ f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
100
+ f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
101
+ f'be killed){colorama.Style.RESET_ALL}')
102
+
103
+ def _add_job_started_msg(self) -> None:
104
+ streaming_message = self._get_job_started_msg()
105
+ self._code.append(f'print({streaming_message!r}, flush=True)')
106
+
107
+ def _get_accelerator_details(
108
+ self,
109
+ resources_dict: Dict[str, float],
110
+ ) -> Tuple[Optional[str], float]:
111
+ resources_copy = resources_dict.copy()
112
+ resources_copy.pop('CPU', None)
113
+
114
+ if not resources_copy:
115
+ return None, 0.0
116
+
117
+ assert len(resources_copy) == 1, (
118
+ 'There can only be one type of accelerator per instance. '
119
+ f'Found: {resources_copy}.')
120
+
121
+ acc_name, acc_count = list(resources_copy.items())[0]
122
+ return acc_name, float(acc_count)
123
+
124
    def _add_constants(self) -> None:
        """Appends constants used by the generated program.

        CANCELLED_RETURN_CODE (137 == 128 + SIGKILL) marks tasks that were
        force-killed after a sibling task failed.
        """
        self._code.append(
            textwrap.dedent(f"""\
                SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}

                CANCELLED_RETURN_CODE = 137
                """))
131
+
132
    def _get_rclone_flush_script(self) -> str:
        """Generate rclone flush script for cached storage mounts.

        This script blocks job completion until all storage mounted with
        CACHED_MOUNT mode is uploaded to remote. It polls rclone's vfs-cache
        log files until each one reports nothing left in use or uploading.

        Returns:
            Bash script as string
        """
        return textwrap.dedent(f"""\

            # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
            # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
            # rclone for normal mounts as well.
            if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
                [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
                [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
                flushed=0
                # extra second on top of --vfs-cache-poll-interval to
                # avoid race condition between rclone log line creation and this check.
                sleep 1
                while [ $flushed -eq 0 ]; do
                    # sleep for the same interval as --vfs-cache-poll-interval
                    sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
                    flushed=1
                    for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
                        exitcode=0
                        tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
                        if [ $exitcode -ne 0 ]; then
                            echo "skypilot: cached mount is still uploading to remote"
                            flushed=0
                            break
                        fi
                    done
                done
                echo "skypilot: cached mount uploaded complete"
            fi""")
169
+
170
    def add_prologue(self, job_id: int) -> None:
        """Initialize code generator and add prologue code.

        Subclasses must override this to emit runtime-specific imports and
        driver setup, and to record `job_id` on the instance.

        Args:
            job_id: SkyPilot internal job ID
        """
        raise NotImplementedError
177
+
178
    def add_setup(
        self,
        num_nodes: int,
        resources_dict: Dict[str, float],
        stable_cluster_internal_ips: List[str],
        env_vars: Dict[str, str],
        setup_cmd: Optional[str] = None,
        setup_log_path: Optional[str] = None,
    ) -> None:
        """Generates code to set up the task on each node.

        stable_cluster_internal_ips is used to ensure that the
        SKYPILOT_NODE_RANK environment variable is assigned in a
        deterministic order whenever a new task is added.

        Subclasses must override this method; call it after add_prologue().
        """
        raise NotImplementedError
194
+
195
    def add_task(
        self,
        num_nodes: int,
        bash_script: Optional[str],
        task_name: Optional[str],
        resources_dict: Dict[str, float],
        log_dir: str,
        env_vars: Optional[Dict[str, str]] = None,
    ) -> None:
        """Generates code to run the bash command on all num_nodes nodes.

        Subclasses must override this method; call it after add_setup().
        """
        raise NotImplementedError
206
+
207
    def add_epilogue(self) -> None:
        """Generate code that checks return codes and updates job status.

        The generated code expects a `returncodes` list to be in scope
        (populated by the subclass's epilogue, e.g. RayCodeGen's
        get_or_fail call) and sets the job to FAILED or SUCCEEDED
        accordingly, scheduling the next pending job either way.
        """
        assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
        assert not self._has_epilogue, 'add_epilogue() called twice?'
        self._has_epilogue = True

        # NOTE(review): the generated `next(r for r in returncodes if r != 137)`
        # raises StopIteration if *every* return code is 137 — presumably at
        # least one worker fails with a non-137 code before the others are
        # force-killed; verify that invariant upstream.
        self._code += [
            textwrap.dedent(f"""\
            if sum(returncodes) != 0:
                job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
                # Schedule the next pending job immediately to make the job
                # scheduling more efficient.
                job_lib.scheduler.schedule_step()
                # This waits for all streaming logs to finish.
                time.sleep(0.5)
                reason = ''
                # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
                if any(r == 139 for r in returncodes):
                    reason = '(likely due to Segmentation Fault)'
                if any(r == 137 for r in returncodes):
                    # Find the first non-137 return code
                    non_137 = next(r for r in returncodes if r != 137)
                    reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
                print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
                      'return code list:{colorama.Style.RESET_ALL}',
                      returncodes,
                      reason,
                      flush=True)
                # Need this to set the job status in ray job to be FAILED.
                sys.exit(1)
            else:
                job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
                # Schedule the next pending job immediately to make the job
                # scheduling more efficient.
                job_lib.scheduler.schedule_step()
                # This waits for all streaming logs to finish.
                time.sleep(0.5)
            """)
        ]
246
+
247
+ def build(self) -> str:
248
+ """Returns the entire generated program."""
249
+ assert self._has_epilogue, 'Call add_epilogue() before build().'
250
+ return '\n'.join(self._code)
251
+
252
+
253
+ class RayCodeGen(TaskCodeGen):
254
+ """Code generator of a Ray program that executes a sky.Task.
255
+
256
+ Usage:
257
+
258
+     >> codegen = RayCodeGen()
259
+ >> codegen.add_prologue()
260
+
261
+ >> codegen.add_task(...)
262
+ >> codegen.add_task(...)
263
+
264
+ >> codegen.add_epilogue()
265
+ >> code = codegen.build()
266
+ """
267
+
268
    def add_prologue(self, job_id: int) -> None:
        """Emits driver imports, ray.init, and the get_or_fail helper.

        Args:
            job_id: SkyPilot internal job ID; recorded on the instance and
                baked into the generated program.
        """
        assert not self._has_prologue, 'add_prologue() called twice?'
        self._has_prologue = True
        self.job_id = job_id
        # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
        # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
        # Otherwise, ray will fail to get the placement group because of a bug
        # in ray job.
        ray_address = 'auto'

        # Add common imports
        self._add_common_imports()

        # Add Ray-specific setup
        self._code.append(
            textwrap.dedent("""\
            # Set the environment variables to avoid deduplicating logs and
            # scheduler events. This should be set in driver code, since we are
            # not using `ray job submit` anymore, and the environment variables
            # from the ray cluster is not inherited.
            os.environ['RAY_DEDUP_LOGS'] = '0'
            os.environ['RAY_SCHEDULER_EVENTS'] = '0'

            import ray
            import ray.util as ray_util
            """))

        self._add_skylet_imports()

        self._add_constants()

        # Add Ray configuration
        self._code.append(
            textwrap.dedent(f"""\
            kwargs = dict()
            # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
            # the directory exists for backward compatibility for the VM
            # launched before #1790.
            if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
                kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
            ray.init(
                address={ray_address!r},
                namespace='__sky__{job_id}__',
                log_to_driver=True,
                **kwargs
            )
            def get_or_fail(futures, pg) -> List[int]:
                \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
                if not futures:
                    return [], []
                returncodes = [1] * len(futures)
                pids = [None] * len(futures)
                failed = False
                # Wait for 1 task to be ready.
                ready = []
                # Keep invoking ray.wait if ready is empty. This is because
                # ray.wait with timeout=None will only wait for 10**6 seconds,
                # which will cause tasks running for more than 12 days to return
                # before becoming ready.
                # (Such tasks are common in serving jobs.)
                # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846

                def handle_ready_tasks(tasks: List[ray.ObjectRef]) -> None:
                    nonlocal returncodes, pids, failed
                    for task in tasks:
                        idx = futures.index(task)
                        res = ray.get(task)
                        returncodes[idx] = res['return_code']
                        pids[idx] = res['pid']
                        if res['return_code'] != 0:
                            failed = True

                while not ready:
                    ready, unready = ray.wait(futures)
                    handle_ready_tasks(ready)
                while unready:
                    if failed:
                        for task in unready:
                            # ray.cancel without force fails to kill tasks.
                            # We use force=True to kill unready tasks.
                            ray.cancel(task, force=True)
                            # Use SIGKILL=128+9 to indicate the task is forcely
                            # killed.
                            idx = futures.index(task)
                            returncodes[idx] = CANCELLED_RETURN_CODE
                        break
                    ready, unready = ray.wait(unready)
                    handle_ready_tasks(ready)
                # Remove the placement group after all tasks are done, so that
                # the next job can be scheduled on the released resources
                # immediately.
                ray_util.remove_placement_group(pg)
                sys.stdout.flush()
                return returncodes, pids

            futures = []
            """))

        self._add_logging_functions()

        self._code += [
            'run_bash_command_with_log = run_bash_command_with_log',
            'run_bash_command_with_log_and_return_pid = \
                ray.remote(run_bash_command_with_log_and_return_pid)',
            'autostop_lib.set_last_active_time_to_now()',
            f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
        ]
375
+
376
    def add_setup(
        self,
        num_nodes: int,
        resources_dict: Dict[str, float],
        stable_cluster_internal_ips: List[str],
        env_vars: Dict[str, str],
        setup_cmd: Optional[str] = None,
        setup_log_path: Optional[str] = None,
    ) -> None:
        """Emits placement-group creation, optional setup command, and the
        IP/rank discovery code for the generated Ray driver program.

        stable_cluster_internal_ips fixes the SKYPILOT_NODE_RANK ordering.
        """
        assert self._has_prologue, ('Call add_prologue() before '
                                    'add_setup().')
        self._has_setup = True

        bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
        # Set CPU to avoid ray hanging the resources allocation
        # for remote functions, since the task will request 1 CPU
        # by default.
        task_cpu_demand = resources_dict.pop('CPU')

        if resources_dict:
            assert len(resources_dict) == 1, (
                'There can only be one type of accelerator per instance. '
                f'Found: {resources_dict}.')
            acc_name, acc_count = list(resources_dict.items())[0]
            gpu_dict = {'GPU': acc_count}
            # gpu_dict should be empty when the accelerator is not GPU.
            # TODO(zongheng,zhanghao): an alternative is to start the remote
            # cluster with custom resource 'GPU': <n> even if the accelerator(s)
            # are not GPU. We opt for the current solution for now.
            if accelerator_registry.is_schedulable_non_gpu_accelerator(
                    acc_name):
                gpu_dict = {}
            for bundle in bundles:
                bundle.update({
                    # Set the GPU to avoid ray hanging the resources allocation
                    **gpu_dict,
                })

        self._code.append(
            f'pg = ray_util.placement_group({json.dumps(bundles)}, '
            f'\'STRICT_SPREAD\')')
        self._add_waiting_for_resources_msg(num_nodes)
        self._code.append(
            textwrap.dedent("""\
            # FIXME: This will print the error message from autoscaler if
            # it is waiting for other task to finish. We should hide the
            # error message.
            ray.get(pg.ready())"""))
        self._add_job_started_msg()

        job_id = self.job_id
        if setup_cmd is not None:
            setup_envs = env_vars.copy()
            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
            self._code += [
                textwrap.dedent(f"""\
                setup_cmd = {setup_cmd!r}
                _SETUP_CPUS = 0.0001
                # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
                # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
                # We unset it so that user setup command may properly use this env var.
                setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
                job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)

                # The schedule_step should be called after the job status is set to non-PENDING,
                # otherwise, the scheduler will think the current job is not submitted yet, and
                # skip the scheduling step.
                job_lib.scheduler.schedule_step()

                # If some nodes are down and then new nodes are added after launching again,
                # the result of `ray.nodes()` will include all the nodes, so we need to get
                # the alive nodes.
                alive_nodes = [n for n in ray.nodes() if 'Alive' in n and n['Alive']]
                total_num_nodes = len(alive_nodes)
                setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
                setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
                setup_workers = [run_bash_command_with_log_and_return_pid \\
                    .options(
                        name='setup',
                        num_cpus=_SETUP_CPUS,
                        scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
                            placement_group=setup_pg,
                            placement_group_bundle_index=i)
                    ) \\
                    .remote(
                        setup_cmd,
                        os.path.expanduser({setup_log_path!r}),
                        env_vars={setup_envs!r},
                        stream_logs=True,
                        with_ray=True,
                    ) for i in range(total_num_nodes)]
                setup_returncodes, setup_pids = get_or_fail(setup_workers, setup_pg)
                success = True
                failed_workers_and_returncodes = []
                for i in range(len(setup_returncodes)):
                    returncode = setup_returncodes[i]
                    pid = setup_pids[i]
                    if pid == None:
                        pid = os.getpid()
                    if returncode != 0 and returncode != CANCELLED_RETURN_CODE:
                        success = False
                        failed_workers_and_returncodes.append((pid, returncode))
                if not success:
                    msg = f'ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed. '
                    msg += f'Failed workers: ' + ', '.join([f'(pid={{pid}}, returncode={{returncode}})' for pid, returncode in failed_workers_and_returncodes])
                    msg += f'. See error logs above for more details.{colorama.Style.RESET_ALL}'
                    print(msg, flush=True)
                    job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
                    # This waits for all streaming logs to finish.
                    time.sleep(1)
                    # Need this to set the job status in ray job to be FAILED.
                    sys.exit(1)
                """)
            ]

        self._code.append(f'job_lib.set_job_started({self.job_id!r})')
        if setup_cmd is None:
            # Need to call schedule_step() to make sure the scheduler
            # schedule the next pending job.
            self._code.append('job_lib.scheduler.schedule_step()')

        # Export IP and node rank to the environment variables.
        self._code += [
            textwrap.dedent(f"""\
            @ray.remote
            def check_ip():
                return ray.util.get_node_ip_address()
            gang_scheduling_id_to_ip = ray.get([
                check_ip.options(
                    num_cpus={task_cpu_demand},
                    scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
                        placement_group=pg,
                        placement_group_bundle_index=i
                    )).remote()
                for i in range(pg.bundle_count)
            ])

            cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
            job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
            job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
            job_ip_list_str = '\\n'.join(job_ip_rank_list)
            """),
        ]
519
+
520
+ def add_task(self,
521
+ num_nodes: int,
522
+ bash_script: Optional[str],
523
+ task_name: Optional[str],
524
+ resources_dict: Dict[str, float],
525
+ log_dir: str,
526
+ env_vars: Optional[Dict[str, str]] = None) -> None:
527
+ # TODO(zhwu): The resources limitation for multi-node ray.tune and
528
+ # horovod should be considered.
529
+ for i in range(num_nodes):
530
+ # Ray's per-node resources, to constrain scheduling each command to
531
+ # the corresponding node, represented by private IPs.
532
+ self._add_ray_task(bash_script=bash_script,
533
+ task_name=task_name,
534
+ resources_dict=resources_dict.copy(),
535
+ log_dir=log_dir,
536
+ env_vars=env_vars,
537
+ gang_scheduling_id=i)
538
+
539
    def _add_ray_task(self,
                      bash_script: Optional[str],
                      task_name: Optional[str],
                      resources_dict: Dict[str, float],
                      log_dir: str,
                      env_vars: Optional[Dict[str, str]] = None,
                      gang_scheduling_id: int = 0) -> None:
        """Generates code for a ray remote task that runs a bash command.

        Note: pops 'CPU' from resources_dict, mutating the passed dict —
        callers (add_task) pass a copy.
        """
        assert self._has_setup, 'Call add_setup() before add_task().'

        task_cpu_demand = resources_dict.pop('CPU')
        # Build remote_task.options(...)
        #   resources=...
        #   num_gpus=...
        options = []
        options.append(f'num_cpus={task_cpu_demand}')

        acc_name, acc_count = self._get_accelerator_details(resources_dict)
        num_gpus = 0.0
        if acc_name is not None:
            assert resources_dict, ('There can only be one type of accelerator '
                                    'per instance.')
            options.append(f'resources={json.dumps(resources_dict)}')
            # Passing this ensures that the Ray remote task gets
            # CUDA_VISIBLE_DEVICES set correctly; non-GPU accelerators are
            # scheduled via custom resources instead.
            if not accelerator_registry.is_schedulable_non_gpu_accelerator(
                    acc_name):
                num_gpus = acc_count
        options.append(f'num_gpus={num_gpus}')
        options.append(
            'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy('  # pylint: disable=line-too-long
            'placement_group=pg, '
            f'placement_group_bundle_index={gang_scheduling_id})')

        sky_env_vars_dict_str = [
            textwrap.dedent(f"""\
                sky_env_vars_dict = {{}}
                sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
                sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
                """)
        ]

        if env_vars is not None:
            sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
                                         for k, v in env_vars.items())
        sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)

        options_str = ', '.join(options)
        logger.debug('Added Task with options: '
                     f'{options_str}')
        rclone_flush_script = self._get_rclone_flush_script()
        # Prepend `unset RAY_RAYLET_PID; ...` so the user's script does not
        # inherit SkyPilot's Ray runtime environment.
        unset_ray_env_vars = ' && '.join(
            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
        self._code += [
            sky_env_vars_dict_str,
            textwrap.dedent(f"""\
                script = {bash_script!r}
                rclone_flush_script = {rclone_flush_script!r}

                if script is not None:
                    script=f'{unset_ray_env_vars}; {{script}}'
                    script += rclone_flush_script
                    sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}

                    ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
                    rank = job_ip_rank_map[ip]

                    if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
                        name_str = '{task_name},' if {task_name!r} != None else 'task,'
                        log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
                    else: # Single-node or multi-node task on multi-node cluster
                        idx_in_cluster = cluster_ips_to_node_id[ip]
                        if cluster_ips_to_node_id[ip] == 0:
                            node_name = 'head'
                        else:
                            node_name = f'worker{{idx_in_cluster}}'
                        name_str = f'{{node_name}}, rank={{rank}},'
                        log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
                    sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank

                    sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}

                    futures.append(run_bash_command_with_log_and_return_pid \\
                            .options(name=name_str, {options_str}) \\
                            .remote(
                                script,
                                log_path,
                                env_vars=sky_env_vars_dict,
                                stream_logs=True,
                                with_ray=True,
                            ))""")
        ]
629
+
630
+ def add_epilogue(self) -> None:
631
+ """Generates code that waits for all tasks, then exits."""
632
+ self._code.append('returncodes, _ = get_or_fail(futures, pg)')
633
+ super().add_epilogue()